Converting a C for-loop to a CUDA for-loop

cudagpunvidiaperformance

I have this low level for loop I've written in C that a friend suggested I write in CUDA. I've set up my CUDA enviroment and have been looking at the docs, but i'm still struggling with the syntax for what's been well over 2 weeks now. Can anyone help me out? What would this look like in CUDA?

float* red = new float [N];
float* green = new float [N];
float* blue = new float [N];

for (int y = 0; y < h; y++)
{
    // Get row ptr from the color image
    const unsigned char* src = rowptr<unsigned char>(color, 0, y, w);

    // Get row ptrs for the destination channel features
    float* rptr = rowptr<float>(red, 0, y, w);
    float* gptr = rowptr<float>(green, 0, y, w);
    float* bptr = rowptr<float>(blue, 0, y, w);

    for (int x = 0; x < w; x++)
    {
        *rptr++ = (float)*src++;
        *gptr++ = (float)*src++;
        *bptr++ = (float)*src++;
    }
}

Best Answer

Here is some sample code. I don't know if it will really answer your questions. Probably you will need to learn more about CUDA. If you can spare the time, taking this webinar and this webinar from the nvidia webinar page would be 2 hours well spent. Also the cuda C programmers manual is a good readable reference.

#include <stdio.h>

#define N      256
#define NUMROW   N
#define NUMCOL   N
#define PIXSIZE  3
#define REDOFF   0
#define GREENOFF 1
#define BLUEOFF  2
#define nTPB    16
#define GRNVAL   5
#define REDVAL   7
#define BLUVAL   9

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void kern(const unsigned numrow, const unsigned numcol, const unsigned char* src, float* rptr, float* gptr, float* bptr){

  unsigned idx = threadIdx.x + (blockDim.x*blockIdx.x);
  unsigned idy = threadIdx.y + (blockDim.y*blockIdx.y);
  if ((idx < numcol) && (idy < numrow)){

    rptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+REDOFF];
    gptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+GREENOFF];
    bptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+BLUEOFF];
    }
}

int main (){

  float *h_red, *h_green, *h_blue;
  float *d_red, *d_green, *d_blue;
  unsigned char *h_img, *d_img;

  if ((h_img =(unsigned char*)malloc(NUMROW*NUMCOL*PIXSIZE*sizeof(unsigned char))) == 0) {printf("malloc fail\n"); return 1;}
  if ((h_red =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
  if ((h_green =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
  if ((h_blue =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}

  cudaMalloc((void **)&d_img, (NUMROW*NUMCOL*PIXSIZE)*sizeof(unsigned char));
  cudaCheckErrors("cudaMalloc1 fail");
  cudaMalloc((void **)&d_red, (NUMROW*NUMCOL)*sizeof(float));
  cudaCheckErrors("cudaMalloc2 fail");
  cudaMalloc((void **)&d_green, (NUMROW*NUMCOL)*sizeof(float));
  cudaCheckErrors("cudaMalloc3 fail");
  cudaMalloc((void **)&d_blue, (NUMROW*NUMCOL)*sizeof(float));
  cudaCheckErrors("cudaMalloc4 fail");

  for (int i=0; i<NUMROW*NUMCOL; i++){
    h_img[(i*PIXSIZE)+ REDOFF]   = REDVAL;
    h_img[(i*PIXSIZE)+ GREENOFF] = GRNVAL;
    h_img[(i*PIXSIZE)+ BLUEOFF]  = BLUVAL;
    }

  cudaMemcpy(d_img, h_img, (NUMROW*NUMCOL*PIXSIZE)*sizeof(unsigned char), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy1 fail");

  dim3 block(nTPB, nTPB);
  dim3 grid(((NUMCOL+nTPB-1)/nTPB),((NUMROW+nTPB-1)/nTPB));
  kern<<<grid,block>>>(NUMROW, NUMCOL, d_img, d_red, d_green, d_blue);
  cudaMemcpy(h_red, d_red, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy2 fail");
  cudaMemcpy(h_green, d_green, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy3 fail");
  cudaMemcpy(h_blue, d_blue, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy4 fail");

  for (int i=0; i<(NUMROW*NUMCOL); i++){
    if (h_red[i] != REDVAL) {printf("Red mismatch at offset %d\n", i); return 1;}
    if (h_green[i] != GRNVAL) {printf("Green mismatch at offset %d\n", i); return 1;}
    if (h_blue[i] != BLUVAL) {printf("Blue mismatch at offset %d\n", i); return 1;}
    }
  printf("Success!\n");
  return 0;
}

In response to a question posed in the comments, here is a modified kernel that shows how to use the rowptr<> template as defined in the comments. Just replace the kernel code above with this:

template <typename T> T* rowptr(T* start, int x, int y, int w) __device__ __host__ { return start + y*w + x; }

__global__ void kern(const unsigned numrow, const unsigned numcol, unsigned char* isrc, float* rptr, float* gptr, float* bptr){


  unsigned idx = threadIdx.x + (blockDim.x*blockIdx.x);
  unsigned idy = threadIdx.y + (blockDim.y*blockIdx.y);
  if ((idx < numcol) && (idy < numrow)){
    unsigned char *src = rowptr<unsigned char>(isrc, (idx*PIXSIZE), idy, (numcol*PIXSIZE));

    rptr[(idy*numcol)+idx] = (float)*src++;
    gptr[(idy*numcol)+idx] = (float)*src++;
    bptr[(idy*numcol)+idx] = (float)*src;
    }
}