Converting a C for-loop to a CUDA for-loop

cudagpunvidiaperformance

I have this low level for loop I've written in C that a friend suggested I write in CUDA. I've set up my CUDA enviroment and have been looking at the docs, but i'm still struggling with the syntax for what's been well over 2 weeks now. Can anyone help me out? What would this look like in CUDA?

float* red = new float [N];
float* green = new float [N];
float* blue = new float [N];

for (int y = 0; y < h; y++)
{
    // Get row ptr from the color image
    const unsigned char* src = rowptr<unsigned char>(color, 0, y, w);

    // Get row ptrs for the destination channel features
    float* rptr = rowptr<float>(red, 0, y, w);
    float* gptr = rowptr<float>(green, 0, y, w);
    float* bptr = rowptr<float>(blue, 0, y, w);

    for (int x = 0; x < w; x++)
    {
        *rptr++ = (float)*src++;
        *gptr++ = (float)*src++;
        *bptr++ = (float)*src++;
    }
}

Best Answer

Here is some sample code. I don't know if it will really answer your questions. Probably you will need to learn more about CUDA. If you can spare the time, taking this webinar and this webinar from the nvidia webinar page would be 2 hours well spent. Also the cuda C programmers manual is a good readable reference.

#include <stdio.h>

#define N      256
#define NUMROW   N
#define NUMCOL   N
#define PIXSIZE  3
#define REDOFF   0
#define GREENOFF 1
#define BLUEOFF  2
#define nTPB    16
#define GRNVAL   5
#define REDVAL   7
#define BLUVAL   9

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void kern(const unsigned numrow, const unsigned numcol, const unsigned char* src, float* rptr, float* gptr, float* bptr){

  unsigned idx = threadIdx.x + (blockDim.x*blockIdx.x);
  unsigned idy = threadIdx.y + (blockDim.y*blockIdx.y);
  if ((idx < numcol) && (idy < numrow)){

    rptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+REDOFF];
    gptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+GREENOFF];
    bptr[(idy*numcol)+idx] = (float)src[(((idy*numcol)+idx)*PIXSIZE)+BLUEOFF];
    }
}

int main (){

  float *h_red, *h_green, *h_blue;
  float *d_red, *d_green, *d_blue;
  unsigned char *h_img, *d_img;

  if ((h_img =(unsigned char*)malloc(NUMROW*NUMCOL*PIXSIZE*sizeof(unsigned char))) == 0) {printf("malloc fail\n"); return 1;}
  if ((h_red =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
  if ((h_green =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}
  if ((h_blue =(float*)malloc(NUMROW*NUMCOL*sizeof(float))) == 0) {printf("malloc fail\n"); return 1;}

  cudaMalloc((void **)&d_img, (NUMROW*NUMCOL*PIXSIZE)*sizeof(unsigned char));
  cudaCheckErrors("cudaMalloc1 fail");
  cudaMalloc((void **)&d_red, (NUMROW*NUMCOL)*sizeof(float));
  cudaCheckErrors("cudaMalloc2 fail");
  cudaMalloc((void **)&d_green, (NUMROW*NUMCOL)*sizeof(float));
  cudaCheckErrors("cudaMalloc3 fail");
  cudaMalloc((void **)&d_blue, (NUMROW*NUMCOL)*sizeof(float));
  cudaCheckErrors("cudaMalloc4 fail");

  for (int i=0; i<NUMROW*NUMCOL; i++){
    h_img[(i*PIXSIZE)+ REDOFF]   = REDVAL;
    h_img[(i*PIXSIZE)+ GREENOFF] = GRNVAL;
    h_img[(i*PIXSIZE)+ BLUEOFF]  = BLUVAL;
    }

  cudaMemcpy(d_img, h_img, (NUMROW*NUMCOL*PIXSIZE)*sizeof(unsigned char), cudaMemcpyHostToDevice);
  cudaCheckErrors("cudaMemcpy1 fail");

  dim3 block(nTPB, nTPB);
  dim3 grid(((NUMCOL+nTPB-1)/nTPB),((NUMROW+nTPB-1)/nTPB));
  kern<<<grid,block>>>(NUMROW, NUMCOL, d_img, d_red, d_green, d_blue);
  cudaMemcpy(h_red, d_red, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy2 fail");
  cudaMemcpy(h_green, d_green, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy3 fail");
  cudaMemcpy(h_blue, d_blue, (NUMROW*NUMCOL)*sizeof(float), cudaMemcpyDeviceToHost);
  cudaCheckErrors("cudaMemcpy4 fail");

  for (int i=0; i<(NUMROW*NUMCOL); i++){
    if (h_red[i] != REDVAL) {printf("Red mismatch at offset %d\n", i); return 1;}
    if (h_green[i] != GRNVAL) {printf("Green mismatch at offset %d\n", i); return 1;}
    if (h_blue[i] != BLUVAL) {printf("Blue mismatch at offset %d\n", i); return 1;}
    }
  printf("Success!\n");
  return 0;
}

In response to a question posed in the comments, here is a modified kernel that shows how to use the rowptr<> template as defined in the comments. Just replace the kernel code above with this:

template <typename T> T* rowptr(T* start, int x, int y, int w) __device__ __host__ { return start + y*w + x; }

__global__ void kern(const unsigned numrow, const unsigned numcol, unsigned char* isrc, float* rptr, float* gptr, float* bptr){


  unsigned idx = threadIdx.x + (blockDim.x*blockIdx.x);
  unsigned idy = threadIdx.y + (blockDim.y*blockIdx.y);
  if ((idx < numcol) && (idy < numrow)){
    unsigned char *src = rowptr<unsigned char>(isrc, (idx*PIXSIZE), idy, (numcol*PIXSIZE));

    rptr[(idy*numcol)+idx] = (float)*src++;
    gptr[(idy*numcol)+idx] = (float)*src++;
    bptr[(idy*numcol)+idx] = (float)*src;
    }
}

Related Solutions

Javascript – Which “href” value should I use for JavaScript links, “#” or “javascript:void(0)”

I use javascript:void(0).

Three reasons. Encouraging the use of # amongst a team of developers inevitably leads to some using the return value of the function called like this:

function doSomething() {
    //Some code
    return false;
}

But then they forget to use return doSomething() in the onclick and just use doSomething().

A second reason for avoiding # is that the final return false; will not execute if the called function throws an error. Hence the developers have to also remember to handle any error appropriately in the called function.

A third reason is that there are cases where the onclick event property is assigned dynamically. I prefer to be able to call a function or assign it dynamically without having to code the function specifically for one method of attachment or another. Hence my onclick (or on anything) in HTML markup look like this:

onclick="someFunc.call(this)"

onclick="someFunc.apply(this, arguments)"

Using javascript:void(0) avoids all of the above headaches, and I haven't found any examples of a downside.

So if you're a lone developer then you can clearly make your own choice, but if you work as a team you have to either state:

Use href="#", make sure onclick always contains return false; at the end, that any called function does not throw an error and if you attach a function dynamically to the onclick property make sure that as well as not throwing an error it returns false.

Use href="javascript:void(0)"

The second is clearly much easier to communicate.

Compiling CUDA with Visual Studio 2010

You'll find a complete guide here:

How to Run CUDA In Visual Studio 2010

Best Answer

Related Solutions

Javascript – Which “href” value should I use for JavaScript links, “#” or “javascript:void(0)”

Compiling CUDA with Visual Studio 2010

Related Topic