总的来说,就是
cudaHostAlloc((void **)&h_A,nBytes,cudaHostAllocMapped);
获取的 h_A 在支持统一虚拟寻址(UVA)的平台上也可以直接当作设备指针使用,不再需要
cudaHostGetDevicePointer((void **)&d_A, (void *)h_A, 0);
来获取设备指针了,测试的代码如下:
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Element-wise kernel: for every index below N, stores A[i] + B[i] + 1000
 * into C[i].
 *
 * Each thread handles the single element selected by its flat 1D global
 * index (blockIdx.x * blockDim.x + threadIdx.x); threads whose index is
 * N or larger do nothing.
 */
__global__ void sumArraysZeroCopy(float *A, float *B, float *C, const int N)
{
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the data exit without touching memory.
    if (gid >= N)
    {
        return;
    }

    C[gid] = A[gid] + B[gid] + 1000;
}
/*
 * Fills the first `size` entries of `ip` with pseudo-random values.
 *
 * Each value is the low 8 bits of rand() divided by 10, i.e. one of
 * 0.0, 0.1, ..., 25.5. A size of zero or less leaves the array untouched.
 */
void initialData(float *ip, int size)
{
    int idx = 0;
    while (idx < size)
    {
        const int lowByte = rand() & 0xFF;
        ip[idx] = (float)lowByte / 10.0f;
        ++idx;
    }
}
/*
 * Prints the first `num` floats of `f` on one line of stdout, each with
 * four decimal places and followed by a space, then ends the line.
 */
void display(float * f,int num){
    int idx = 0;
    while (idx < num) {
        printf("%.4f ", f[idx]);
        ++idx;
    }
    printf("\n");
}
/* Prints a readable message and terminates when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Zero-copy demo: allocates three mapped pinned host buffers with
 * cudaHostAlloc(..., cudaHostAllocMapped) and passes the host pointers
 * straight to the kernel, without calling cudaHostGetDevicePointer.
 *
 * Prints the host-computed A + B first, then the kernel result
 * (A + B + 1000) read back through the same host pointer h_C.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on any CUDA error or when
 * the device does not share an address space with the host.
 */
int main(){
    const int num = 10;
    const size_t nBytes = num * sizeof(float);
    float *h_A = NULL, *h_B = NULL, *h_C = NULL;

    // Using the cudaHostAlloc pointer directly as a kernel argument is only
    // valid when host and device share one virtual address space.
    int device = 0;
    int unifiedAddressing = 0;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    checkCuda(cudaDeviceGetAttribute(&unifiedAddressing, cudaDevAttrUnifiedAddressing, device),
              "cudaDeviceGetAttribute");
    if (!unifiedAddressing)
    {
        fprintf(stderr, "device %d does not support unified addressing; "
                        "cudaHostGetDevicePointer would be required\n", device);
        return EXIT_FAILURE;
    }

    checkCuda(cudaHostAlloc((void **)&h_A, nBytes, cudaHostAllocMapped), "cudaHostAlloc(h_A)");
    checkCuda(cudaHostAlloc((void **)&h_B, nBytes, cudaHostAllocMapped), "cudaHostAlloc(h_B)");
    checkCuda(cudaHostAlloc((void **)&h_C, nBytes, cudaHostAllocMapped), "cudaHostAlloc(h_C)");

    initialData(h_A, num);
    initialData(h_B, num);

    // Host reference result, printed before the kernel overwrites h_C.
    for (int i = 0; i < num; i++)
    {
        h_C[i] = h_A[i] + h_B[i];
    }
    display(h_C, num);

    // No cudaHostGetDevicePointer here: the host pointers are handed to the
    // kernel as they are. The element count must be exactly num; a larger
    // bound would let surplus threads write past the end of the buffers.
    const int threads = 32;
    const int blocks = (num + threads - 1) / threads;
    sumArraysZeroCopy<<<blocks, threads>>>(h_A, h_B, h_C, num);
    checkCuda(cudaGetLastError(), "sumArraysZeroCopy launch");

    // The launch is asynchronous; wait for the kernel to finish before the
    // host reads h_C.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    display(h_C, num);

    checkCuda(cudaFreeHost(h_A), "cudaFreeHost(h_A)");
    checkCuda(cudaFreeHost(h_B), "cudaFreeHost(h_B)");
    checkCuda(cudaFreeHost(h_C), "cudaFreeHost(h_C)");
    return 0;
}
结果如下: