cudaMalloc 分配GPU内存到setDevice指定的设备上
cudaMallocHost 分配page locked memory,也叫做pinned memory,页锁定内存
- 页锁定内存是主机内存,CPU可直接访问
cudaMemcpy 实现Device To Host 或 Host To Device
cudaStreamCreate 创建流,是异步控制的主要手段
// Demo: round-trips a 100-float buffer host -> device -> pinned host memory
// using blocking cudaMemcpy calls, then prints element 2 of the result.
// Returns 0 on success and 1 if any CUDA runtime call fails.
int main(){
    constexpr int kCount = 100;
    constexpr size_t kBytes = kCount * sizeof(float);
    const int device_id = 0;

    // Reports a failed CUDA runtime call on stderr; yields true on success.
    auto ok = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
            return true;
        }
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    };

    // `good` short-circuits: once one call fails, later CUDA calls are skipped.
    bool good = ok(cudaSetDevice(device_id), "cudaSetDevice");

    // 1. Allocate the device buffer on the selected device.
    float* memory_device = nullptr;
    good = good && ok(cudaMalloc(&memory_device, kBytes), "cudaMalloc");

    // 2. Allocate a pageable host buffer (zero-initialized so the copy below
    //    never transfers indeterminate values) and set the element at index 2.
    float* memory_host = new float[kCount]();
    memory_host[2] = 250.0f;

    // 3. Copy the whole host buffer to the device.
    good = good && ok(cudaMemcpy(memory_device, memory_host, kBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy(HostToDevice)");

    // 4. Allocate page-locked (pinned) host memory; the address of the
    //    allocation is returned through the first argument.
    float* memory_page_locked = nullptr;
    good = good && ok(cudaMallocHost(&memory_page_locked, kBytes), "cudaMallocHost");

    // 5. Copy the device buffer back into the pinned host buffer.
    good = good && ok(cudaMemcpy(memory_page_locked, memory_device, kBytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy(DeviceToHost)");

    if (good) {
        printf("%f\n", memory_page_locked[2]);
    }

    // Release whatever was actually allocated, even after a failure above.
    if (memory_page_locked != nullptr) {
        good = ok(cudaFreeHost(memory_page_locked), "cudaFreeHost") && good;
    }
    delete [] memory_host;
    if (memory_device != nullptr) {
        good = ok(cudaFree(memory_device), "cudaFree") && good;
    }
    return good ? 0 : 1;
}
1、在gpu上开辟空间,将地址记录在mem_device;
2、在cpu上开辟空间,将地址记录在mem_host上,并修改该地址指向区域的第三个值(下标为2);
3、把mem_host所指向区域数据都复制到mem_device的所指区域;
4、在cpu上开辟一块空间,把地址记录在mem_page_locked;
5、把mem_device所指区域的数据复制回cpu上的mem_page_locked。
// Demo: same host -> device -> pinned-host round trip, but the copies are
// enqueued on an explicit stream with cudaMemcpyAsync and the host waits for
// them with cudaStreamSynchronize before reading the result.
// Returns 0 on success and 1 if any CUDA runtime call fails.
int main(){
    constexpr int kCount = 100;
    constexpr size_t kBytes = kCount * sizeof(float);
    const int device_id = 0;

    // Reports a failed CUDA runtime call on stderr; yields true on success.
    auto ok = [](cudaError_t status, const char* what) {
        if (status == cudaSuccess) {
            return true;
        }
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    };

    // `good` short-circuits: once one call fails, later CUDA calls are skipped.
    bool good = ok(cudaSetDevice(device_id), "cudaSetDevice");

    // The stream orders the two copies below relative to each other.
    cudaStream_t stream = nullptr;
    good = good && ok(cudaStreamCreate(&stream), "cudaStreamCreate");

    float* memory_device = nullptr;
    good = good && ok(cudaMalloc(&memory_device, kBytes), "cudaMalloc");

    // Pageable host buffer, zero-initialized so the copy never transfers
    // indeterminate values; only the element at index 2 is set explicitly.
    float* memory_host = new float[kCount]();
    memory_host[2] = 200.2f;

    // NOTE(review): memory_host is pageable (allocated with new), so this
    // transfer may not be fully asynchronous with respect to the host; use
    // pinned memory for the source if real copy/compute overlap is needed.
    good = good && ok(cudaMemcpyAsync(memory_device, memory_host, kBytes,
                                      cudaMemcpyHostToDevice, stream),
                      "cudaMemcpyAsync(HostToDevice)");

    float* memory_page_locked = nullptr;
    good = good && ok(cudaMallocHost(&memory_page_locked, kBytes), "cudaMallocHost");

    // Enqueued on the same stream, so it runs after the host-to-device copy.
    good = good && ok(cudaMemcpyAsync(memory_page_locked, memory_device, kBytes,
                                      cudaMemcpyDeviceToHost, stream),
                      "cudaMemcpyAsync(DeviceToHost)");

    // The host must wait for the stream before reading memory_page_locked.
    good = good && ok(cudaStreamSynchronize(stream), "cudaStreamSynchronize");

    if (good) {
        printf("%f\n", memory_page_locked[2]);
    }

    // Release whatever was actually created, even after a failure above.
    if (memory_page_locked != nullptr) {
        good = ok(cudaFreeHost(memory_page_locked), "cudaFreeHost") && good;
    }
    if (memory_device != nullptr) {
        good = ok(cudaFree(memory_device), "cudaFree") && good;
    }
    if (stream != nullptr) {
        good = ok(cudaStreamDestroy(stream), "cudaStreamDestroy") && good;
    }
    delete [] memory_host;
    return good ? 0 : 1;
}