CUDA内存拷贝
1、cudaMemcpy()<--> cudaMalloc() //线性内存拷贝
1 //线性内存拷贝 2 cudaMalloc((void**)&dev_A, data_size); 3 cudaMemcpy(dev_A, host_A, data_size, cudaMemcpyHostToDevice);
2、cudaMemcpy2D()<-->cudaMallocPitch() //线性内存拷贝
cudaError_t cudaMemcpy2D( void * dst, size_t dpitch, const void * src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind )
例:
1 cudaMallocPitch((void**)&devPtr, &pitch, width * sizeof(float), height); 2 cudaMemcpy2D( void* dst,size_t dpitch,const void* src,size_t spitch,size_t width,size_t height,enum cudaMemcpyKind kind )
3、cudaMemcpy2DToArray()<-->cudaMallocArray() //(二维)线性内存到2维数组的拷贝
1 cudaError_t cudaMemcpy2DToArray ( 2 struct cudaArray * dst, 3 size_t wOffset, 4 size_t hOffset, 5 const void * src, 6 size_t spitch, 7 size_t width, 8 size_t height, 9 enum cudaMemcpyKind kind 10 )
例:
// Matrix-vector multiply y = A * x (A is m x n), reading A through a
// float4 texture bound to `texRefA` (declared elsewhere in the file).
// Launch shape: 16x16 thread blocks, one block row per 16 matrix rows.
void mv(float *y, float *A, float *x, int m, int n)
{
    // Round m up to a multiple of 16 (one 16-row group per block).
    int blkNum = (m >> 4) + ((m & 15) ? 1 : 0);
    int height = blkNum << 4;
    // Round n up to a multiple of 256 for the padded array width.
    int width = (n & 255) ? (((n >> 8) + 1) << 8) : n;

    dim3 threads(16, 16);
    dim3 grid(blkNum, 1);

    cudaArray *d_A;
    float *d_x, *d_y;

    // Each float4 texel packs four consecutive row floats, hence width >> 2.
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float4>();
    cudaMallocArray(&d_A, &channelDesc, width >> 2, height);
    // Host rows are tightly packed: source pitch == copied row width
    // == n * sizeof(float); m rows are transferred.
    cudaMemcpy2DToArray(d_A, 0, 0, A, n * sizeof(float), n * sizeof(float), m, cudaMemcpyHostToDevice);
    cudaBindTextureToArray(texRefA, d_A);

    cudaMalloc((void **) &d_x, n * sizeof(float));
    cudaMalloc((void **) &d_y, m * sizeof(float));

    cudaMemcpy(d_x, x, n * sizeof(float), cudaMemcpyHostToDevice);
    mv_kernel<<< grid, threads >>>(d_y, d_A, d_x, m, n);
    cudaMemcpy(y, d_y, m * sizeof(float), cudaMemcpyDeviceToHost);

    // Release device resources in reverse order of acquisition.
    cudaFree(d_y);
    cudaFree(d_x);
    cudaUnbindTexture(texRefA);
    cudaFreeArray(d_A);
}
4、cudaMemcpyToArray()<-->cudaMallocArray() //(1维)线性内存到2维数组的拷贝
1 cudaError_t cudaMemcpyToArray( 2 struct cudaArray * dst, 3 size_t wOffset, 4 size_t hOffset, 5 const void * src, 6 size_t count, 7 enum cudaMemcpyKind kind 8 )
例:
// Allocates a 128x128 single-channel float CUDA array, fills it from
// h_volume, and binds it to the texture reference `tex` (declared elsewhere).
// The `velocity` parameter is unused in this snippet.
// NOTE(review): the copy kind is cudaMemcpyDeviceToDevice although the
// source parameter is named h_volume — confirm whether that pointer really
// lives in device memory; a host buffer would need cudaMemcpyHostToDevice.
void initCudaTexture(float *h_volume, float2 *velocity)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);

    cudaMallocArray(&d_volumeArray, &desc, 128, 128);

    // 1-D copy of the full 128x128 float payload into the 2-D array.
    cudaMemcpyToArray(d_volumeArray, 0, 0, h_volume, sizeof(float)*128*128, cudaMemcpyDeviceToDevice);

    // Sampling setup: normalized coordinates, bilinear filtering, wrap on both axes.
    tex.normalized = true;
    tex.filterMode = cudaFilterModeLinear;
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;

    cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray));
}
5、cudaMemcpy3D()<-->cudaMalloc3DArray() //线性内存(或CUDA数组)与3维数组之间的拷贝
1 cudaError_t cudaMemcpy3D(const struct cudaMemcpy3DParms * p) 2 3 struct cudaExtent { 4 size_t width; 5 size_t height; 6 size_t depth; 7 }; 8 struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d); 9 10 struct cudaPos { 11 size_t x; 12 size_t y; 13 size_t z; 14 }; 15 struct cudaPos make_cudaPos(size_t x, size_t y, size_t z); 16 17 struct cudaMemcpy3DParms { 18 struct cudaArray *srcArray; 19 struct cudaPos srcPos; 20 struct cudaPitchedPtr srcPtr; 21 struct cudaArray *dstArray; 22 struct cudaPos dstPos; 23 struct cudaPitchedPtr dstPtr; 24 struct cudaExtent extent; 25 enum cudaMemcpyKind kind; 26 };
例:
// Allocates a 3-D uchar CUDA array of extent `volumeSize`, uploads the
// tightly packed host buffer h_volume into it, and binds it to the 3-D
// texture reference `tex` (declared elsewhere in the file).
void initCudaTexture(const uchar *h_volume, cudaExtent volumeSize)
{
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();

    cutilSafeCall(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));

    // Describe the host -> 3-D array copy: source is a tightly packed
    // volume, so its pitch equals the row byte count (width * sizeof(uchar)).
    cudaMemcpy3DParms copyParams = {0};
    copyParams.srcPtr = make_cudaPitchedPtr((void*)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
    copyParams.dstArray = d_volumeArray;
    copyParams.extent = volumeSize;
    copyParams.kind = cudaMemcpyHostToDevice;
    // Fixed: the original text had the HTML-entity-garbled "©Params";
    // cudaMemcpy3D takes the address of the parameter struct.
    cutilSafeCall(cudaMemcpy3D(&copyParams));

    // Sampling setup: normalized coordinates, trilinear filtering, wrap on all axes.
    tex.normalized = true;
    tex.filterMode = cudaFilterModeLinear;
    tex.addressMode[0] = cudaAddressModeWrap;
    tex.addressMode[1] = cudaAddressModeWrap;
    tex.addressMode[2] = cudaAddressModeWrap;

    cutilSafeCall(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}
6、cudaMemcpyToSymbol() //拷贝到常数存储器
1 __constant__ float constData[256]; 2 float data[256]; 3 cudaMemcpyToSymbol(constData, data, sizeof(data)); 4 cudaMemcpyFromSymbol(data, constData, sizeof(data)); 5 __device__ float devData; float value = 3.14f; 6 cudaMemcpyToSymbol(devData, &value, sizeof(float)); 7 __device__ float* devPointer; float* ptr; 8 cudaMalloc(&ptr, 256 * sizeof(float)); 9 cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));
stream中的内存拷贝
因为硬件中并没有流的概念,内存复制操作在硬件上是必须排队的,所以可以调整一下数据的copy顺序来保证程序的并发执行,这就需要以异步方式进行内存复制。
简单可以理解为:cudaMemcpy是同步的,而cudaMemcpyAsync是异步的。具体理解需要弄清以下概念:
1.CUDA Streams
在cuda中一个Stream是由主机代码发布的一系列在设备上执行的操作,必须确保顺序执行。不同streams里面的操作可以交叉执行或者并发执行。
2.默认stream
设备操作包括:数据传输和kernels,在cuda中,所有的设备操作都在stream中执行。当没有指定stream时,使用默认的stream。默认stream是一个针对设备操作同步的stream,也就是说,只有当所有之前设备上任何stream里面的操作全部完成时,才开始默认stream里面操作的执行,并且默认stream里面的一个操作必须完成,其他任何stream里面的操作才能开始。
例如以下代码:
cudaMemcpy(d_a, a, numBytes, cudaMemcpyHostToDevice); increment<<<1,N>>>(d_a) cudaMemcpy(a, d_a, numBytes, cudaMemcpyDeviceToHost);
从设备端来看,这三个操作都在默认stream中,并且按顺序执行;从主机端来看,数据传输是阻塞的或者同步传输,而kernel启动是异步的。第一步主机到设备的数据传输是同步的,CPU线程不能到达第二行直到主机到设备的数据传输完成。kernel一旦发射,CPU线程即移到第三行,但是该行的传输不能开始,因为设备端正在执行第二行的内容。
3.非默认stream
非默认stream中的数据传输使用函数cudaMemcpyAsync(),这个函数在主机端是非阻塞的,传输处理后控制权马上返回给主机线程,解决数据依赖关系进行单程序多数据的处理。
例如以下代码:
其中,N为DATA_SIZE中的一部分数据,可以进行for循环,依次取出N个数据进行计算。多个kernel并发依此类推进行内存的复制,保证一个kernel计算的时候,另外一个kernel的数据在进行传输,不会停止等待。cudaHostAlloc( (void**)&host_a,DATA_SIZE * sizeof(int),cudaHostAllocDefault ) ; cudaMalloc( (void**)&dev_a0,N * sizeof(int) ) ; cudaMemcpyAsync( dev_a0, host_a,N * sizeof(int),cudaMemcpyHostToDevice,stream0 ) ;
参考:http://www.cnblogs.com/traceorigin/archive/2013/04/12/3016568.html
http://www.cnblogs.com/shrimp-can/p/5231857.html