N equivalent to memcpy() that works inside a CUDA kernel

cuda

I'm trying to break apart and reshape the structure of an array asynchronously using the CUDA kernel. memcpy() doesn't work inside the kernel, and neither does cudaMemcpy()*; I'm at a loss.

Can anyone tell me the preferred method for copying memory from within the CUDA kernel?

It is worth noting, cudaMemcpy(void *to, void *from, size, cudaMemcpyDeviceToDevice) will NOT work for what I am trying to do, because it can only be called from outside of the kernel and does not execute asynchronously.

Best Answer

Yes, there is an equivalent to memcpy that works inside cuda kernels. It is called memcpy. As an example:

__global__ void kernel(int **in, int **out, int len, int N)
{
    int idx = threadIdx.x + blockIdx.x*blockDim.x;

    for(; idx<N; idx+=gridDim.x*blockDim.x)
        memcpy(out[idx], in[idx], sizeof(int)*len);

}

which compiles without error like this:

$ nvcc -Xptxas="-v" -arch=sm_20 -c memcpy.cu 
ptxas info    : Compiling entry function '_Z6kernelPPiS0_ii' for 'sm_20'
ptxas info    : Function properties for _Z6kernelPPiS0_ii
    0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info    : Used 11 registers, 48 bytes cmem[0]

and emits PTX:

.version 3.0
.target sm_20
.address_size 32

    .file   1 "/tmp/tmpxft_00000407_00000000-9_memcpy.cpp3.i"
    .file   2 "memcpy.cu"
    .file   3 "/usr/local/cuda/nvvm/ci_include.h"

.entry _Z6kernelPPiS0_ii(
    .param .u32 _Z6kernelPPiS0_ii_param_0,
    .param .u32 _Z6kernelPPiS0_ii_param_1,
    .param .u32 _Z6kernelPPiS0_ii_param_2,
    .param .u32 _Z6kernelPPiS0_ii_param_3
)
{
    .reg .pred  %p<4>;
    .reg .s32   %r<32>;
    .reg .s16   %rc<2>;


    ld.param.u32    %r15, [_Z6kernelPPiS0_ii_param_0];
    ld.param.u32    %r16, [_Z6kernelPPiS0_ii_param_1];
    ld.param.u32    %r2, [_Z6kernelPPiS0_ii_param_3];
    cvta.to.global.u32  %r3, %r15;
    cvta.to.global.u32  %r4, %r16;
    .loc 2 4 1
    mov.u32     %r5, %ntid.x;
    mov.u32     %r17, %ctaid.x;
    mov.u32     %r18, %tid.x;
    mad.lo.s32  %r30, %r5, %r17, %r18;
    .loc 2 6 1
    setp.ge.s32     %p1, %r30, %r2;
    @%p1 bra    BB0_5;

    ld.param.u32    %r26, [_Z6kernelPPiS0_ii_param_2];
    shl.b32     %r7, %r26, 2;
    .loc 2 6 54
    mov.u32     %r19, %nctaid.x;
    .loc 2 4 1
    mov.u32     %r29, %ntid.x;
    .loc 2 6 54
    mul.lo.s32  %r8, %r29, %r19;

BB0_2:
    .loc 2 7 1
    shl.b32     %r21, %r30, 2;
    add.s32     %r22, %r4, %r21;
    ld.global.u32   %r11, [%r22];
    add.s32     %r23, %r3, %r21;
    ld.global.u32   %r10, [%r23];
    mov.u32     %r31, 0;

BB0_3:
    add.s32     %r24, %r10, %r31;
    ld.u8   %rc1, [%r24];
    add.s32     %r25, %r11, %r31;
    st.u8   [%r25], %rc1;
    add.s32     %r31, %r31, 1;
    setp.lt.u32     %p2, %r31, %r7;
    @%p2 bra    BB0_3;

    .loc 2 6 54
    add.s32     %r30, %r8, %r30;
    ld.param.u32    %r27, [_Z6kernelPPiS0_ii_param_3];
    .loc 2 6 1
    setp.lt.s32     %p3, %r30, %r27;
    @%p3 bra    BB0_2;

BB0_5:
    .loc 2 9 2
    ret;
}

The code block at BB0_3 is a byte sized memcpy loop emitted automagically by the compiler. It might not be a great idea from a performance point-of-view to use it, but it is fully supported (and has been for a long time on all architectures).


Edited four years later to add that since the device side runtime API was released as part of the CUDA 6 release cycle, it is also possible to directly call something like

cudaMemcpyAsync(void *to, void *from, size, cudaMemcpyDeviceToDevice)

in device code for all architectures which support it (Compute Capability 3.5 and newer hardware using separate compilation and device linking).

Related Topic