申请主机端内存
// Matrix dimensions: nx columns by ny rows, 2^14 each.
int nx = 1<<14;
int ny = 1<<14;
// Total element count (2^28), which still fits in a 32-bit int.
int nxy = nx*ny;
// The byte count is held as size_t: it is already 1 GiB here, and an int
// would wrap as soon as the matrix grows past 2 GiB.
size_t nBytes = (size_t)nxy*sizeof(float);
// Allocate host memory:
//   h_A, h_B - input matrices
//   hostRef  - presumably the CPU reference result (name-based; confirm at the use site)
//   gpuRef   - receives the result copied back from the device
float *h_A, *h_B, *hostRef, *gpuRef;
h_A=(float *)malloc(nBytes);
h_B=(float *)malloc(nBytes);
hostRef = (float *)malloc(nBytes);
gpuRef = (float *)malloc(nBytes);
// Four 1 GiB requests can easily fail; stop here instead of dereferencing
// a NULL pointer later on.
if (h_A == NULL || h_B == NULL || hostRef == NULL || gpuRef == NULL) {
    fprintf(stderr, "host malloc of %llu bytes failed\n",
            (unsigned long long)nBytes);
    // free(NULL) is a no-op, so releasing all four is safe.
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    exit(EXIT_FAILURE);
}
申请设备端内存
// Allocate device global memory for the two inputs (d_MatA, d_MatB) and the
// output (d_MatC). Pointers start as NULL so a failed allocation never leaves
// an indeterminate value behind.
float *d_MatA = NULL, *d_MatB = NULL, *d_MatC = NULL;
// Each allocation is attempted only if the previous one succeeded; the first
// failing status is the one that gets reported.
cudaError_t allocErr = cudaMalloc((void **)&d_MatA, nBytes);
if (allocErr == cudaSuccess) {
    allocErr = cudaMalloc((void **)&d_MatB, nBytes);
}
if (allocErr == cudaSuccess) {
    allocErr = cudaMalloc((void **)&d_MatC, nBytes);
}
if (allocErr != cudaSuccess) {
    fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(allocErr));
    exit(EXIT_FAILURE);
}
将数据拷贝到设备端
// Transfer the input matrices from host to device.
// NOTE(review): h_A and h_B must already hold initialized data at this point;
// the initialization is not shown in this snippet.
cudaError_t h2dErr = cudaMemcpy(d_MatA, h_A, nBytes, cudaMemcpyHostToDevice);
if (h2dErr == cudaSuccess) {
    h2dErr = cudaMemcpy(d_MatB, h_B, nBytes, cudaMemcpyHostToDevice);
}
// Stop on a failed copy so the device never computes on partial input.
if (h2dErr != cudaSuccess) {
    fprintf(stderr, "host-to-device cudaMemcpy failed: %s\n",
            cudaGetErrorString(h2dErr));
    exit(EXIT_FAILURE);
}
将数据拷贝到主机端
// Copy the kernel result from device buffer d_MatC back into host buffer gpuRef.
// The status is checked because a fault in an earlier kernel (launched before
// this point, not shown here) can surface at this call, and gpuRef would
// otherwise be read without any sign that it is invalid.
cudaError_t d2hErr = cudaMemcpy(gpuRef,d_MatC,nBytes,cudaMemcpyDeviceToHost);
if (d2hErr != cudaSuccess) {
    fprintf(stderr, "device-to-host cudaMemcpy failed: %s\n",
            cudaGetErrorString(d2hErr));
    exit(EXIT_FAILURE);
}
内存释放
// Release the device buffers, then the host buffers.
// All three cudaFree calls are always attempted; a failure is reported but
// does not stop the remaining cleanup.
cudaError_t freeErrA = cudaFree(d_MatA);
cudaError_t freeErrB = cudaFree(d_MatB);
cudaError_t freeErrC = cudaFree(d_MatC);
// Report the first non-success status, if any.
cudaError_t freeErr = (freeErrA != cudaSuccess) ? freeErrA
                    : (freeErrB != cudaSuccess) ? freeErrB
                    : freeErrC;
if (freeErr != cudaSuccess) {
    fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(freeErr));
}
free(h_A);
free(h_B);
free(gpuRef);
free(hostRef);