概述
仔细观察可以发现,CUDA编程里面的内存操作多数都是重复的,因此我们可以考虑把这些重复的代码封装成类的形式,以后使用时直接调用即可。需要的朋友可以直接使用这套模板,当然也欢迎在此基础上改造出适合自己的版本。
模板
// Wraps the repetitive CUDA memory chores (allocate, free, copy) so call
// sites stay short. Every CUDA runtime call is checked: CUDA errors are
// sticky, so an unchecked failure here would otherwise surface as a
// confusing error much later. Conventions: Malloc* lengths are ELEMENT
// counts; Copy* sizes are BYTE counts (matching cudaMemcpy's contract).
class CudaMemoryManager{
public:
    // Allocate `length` ints in GPU global memory; aborts on failure.
    int* MallocDevice_Int(int length){
        int* data = nullptr;
        Check(cudaMalloc((void**)&data, length*sizeof(int)), "cudaMalloc(int)");
        return data;
    }
    // Allocate `length` floats in GPU global memory; aborts on failure.
    float* MallocDevice_Float(int length){
        float* data = nullptr;
        Check(cudaMalloc((void**)&data, length*sizeof(float)), "cudaMalloc(float)");
        return data;
    }
    // Allocate `length` ints of pinned (page-locked) host memory.
    int* MallocHost_Int(int length){
        int* data = nullptr;
        Check(cudaMallocHost((void**)&data, length*sizeof(int)), "cudaMallocHost(int)");
        return data;
    }
    // Allocate `length` floats of pinned (page-locked) host memory.
    float* MallocHost_Float(int length){
        float* data = nullptr;
        Check(cudaMallocHost((void**)&data, length*sizeof(float)), "cudaMallocHost(float)");
        return data;
    }
    // Release device memory obtained from MallocDevice_*.
    void FreeDevice_Int(int* data){
        Check(cudaFree(data), "cudaFree(int)");
    }
    void FreeDevice_Float(float* data){
        Check(cudaFree(data), "cudaFree(float)");
    }
    // Release pinned host memory obtained from MallocHost_*.
    void FreeHost_Int(int* data){
        Check(cudaFreeHost(data), "cudaFreeHost(int)");
    }
    void FreeHost_Float(float* data){
        Check(cudaFreeHost(data), "cudaFreeHost(float)");
    }
    // Blocking copy of `size` BYTES from host buffer to device buffer.
    void CopyH2D_Int(int* host, int* device, int size){
        Check(cudaMemcpy(device, host, size, cudaMemcpyHostToDevice), "cudaMemcpy H2D(int)");
    }
    void CopyH2D_Float(float* host, float* device, int size){
        Check(cudaMemcpy(device, host, size, cudaMemcpyHostToDevice), "cudaMemcpy H2D(float)");
    }
    // Blocking copy of `size` BYTES from device buffer to host buffer.
    // Also synchronizes with previously launched kernels on the default stream.
    void CopyD2H_Int(int* device, int* host, int size){
        Check(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H(int)");
    }
    void CopyD2H_Float(float* device, float* host, int size){
        Check(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H(float)");
    }
private:
    // Abort with a readable message when a CUDA runtime call fails.
    static void Check(cudaError_t err, const char* what){
        if (err != cudaSuccess){
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::abort();
        }
    }
};
使用案例
//C++标准库
#include <cstdlib>
#include <iostream>
//Cuda运行库
#include <cuda_runtime.h>
using namespace std;
// Wraps the repetitive CUDA memory chores (allocate, free, copy) so call
// sites stay short. Every CUDA runtime call is checked: CUDA errors are
// sticky, so an unchecked failure here would otherwise surface as a
// confusing error much later. Conventions: Malloc* lengths are ELEMENT
// counts; Copy* sizes are BYTE counts (matching cudaMemcpy's contract).
class CudaMemoryManager{
public:
    // Allocate `length` ints in GPU global memory; aborts on failure.
    int* MallocDevice_Int(int length){
        int* data = nullptr;
        Check(cudaMalloc((void**)&data, length*sizeof(int)), "cudaMalloc(int)");
        return data;
    }
    // Allocate `length` floats in GPU global memory; aborts on failure.
    float* MallocDevice_Float(int length){
        float* data = nullptr;
        Check(cudaMalloc((void**)&data, length*sizeof(float)), "cudaMalloc(float)");
        return data;
    }
    // Allocate `length` ints of pinned (page-locked) host memory.
    int* MallocHost_Int(int length){
        int* data = nullptr;
        Check(cudaMallocHost((void**)&data, length*sizeof(int)), "cudaMallocHost(int)");
        return data;
    }
    // Allocate `length` floats of pinned (page-locked) host memory.
    float* MallocHost_Float(int length){
        float* data = nullptr;
        Check(cudaMallocHost((void**)&data, length*sizeof(float)), "cudaMallocHost(float)");
        return data;
    }
    // Release device memory obtained from MallocDevice_*.
    void FreeDevice_Int(int* data){
        Check(cudaFree(data), "cudaFree(int)");
    }
    void FreeDevice_Float(float* data){
        Check(cudaFree(data), "cudaFree(float)");
    }
    // Release pinned host memory obtained from MallocHost_*.
    void FreeHost_Int(int* data){
        Check(cudaFreeHost(data), "cudaFreeHost(int)");
    }
    void FreeHost_Float(float* data){
        Check(cudaFreeHost(data), "cudaFreeHost(float)");
    }
    // Blocking copy of `size` BYTES from host buffer to device buffer.
    void CopyH2D_Int(int* host, int* device, int size){
        Check(cudaMemcpy(device, host, size, cudaMemcpyHostToDevice), "cudaMemcpy H2D(int)");
    }
    void CopyH2D_Float(float* host, float* device, int size){
        Check(cudaMemcpy(device, host, size, cudaMemcpyHostToDevice), "cudaMemcpy H2D(float)");
    }
    // Blocking copy of `size` BYTES from device buffer to host buffer.
    // Also synchronizes with previously launched kernels on the default stream.
    void CopyD2H_Int(int* device, int* host, int size){
        Check(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H(int)");
    }
    void CopyD2H_Float(float* device, float* host, int size){
        Check(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost), "cudaMemcpy D2H(float)");
    }
private:
    // Abort with a readable message when a CUDA runtime call fails.
    static void Check(cudaError_t err, const char* what){
        if (err != cudaSuccess){
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::abort();
        }
    }
};
//核函数,GPU执行向量加法
// Element-wise vector addition on the GPU: VectorC[i] = VectorA[i] + VectorB[i].
// Expected launch: a single block whose x-dimension equals the vector length;
// indexing uses threadIdx.x only and there is NO bounds check, so the launch
// configuration must not over-provision threads.
__global__ void add(int* VectorA, int* VectorB, int* VectorC){
    const int i = threadIdx.x;
    VectorC[i] = VectorA[i] + VectorB[i];
}
//主函数,CPU执行数据初始化
// Host driver: builds two N-element int vectors, adds them on the GPU via
// the `add` kernel, and prints the result. Memory errors abort inside
// CudaMemoryManager; the kernel launch itself is checked explicitly below
// because launches do not return a status directly.
int main(){
    const int N = 5; // vector length (previously hard-coded in every call)
    CudaMemoryManager cudaMemoryManager;
    // Pinned host buffers: A = {0..N-1}, B = all ones, C = result (zeroed).
    int* Vector_A_Host = cudaMemoryManager.MallocHost_Int(N);
    for (int i = 0; i < N; i++) Vector_A_Host[i] = i;
    int* Vector_B_Host = cudaMemoryManager.MallocHost_Int(N);
    for (int i = 0; i < N; i++) Vector_B_Host[i] = 1;
    int* Vector_C_Host = cudaMemoryManager.MallocHost_Int(N);
    for (int i = 0; i < N; i++) Vector_C_Host[i] = 0;
    // Matching device buffers.
    int* Vector_A_Device = cudaMemoryManager.MallocDevice_Int(N);
    int* Vector_B_Device = cudaMemoryManager.MallocDevice_Int(N);
    int* Vector_C_Device = cudaMemoryManager.MallocDevice_Int(N);
    // Upload the inputs (sizes are in bytes).
    cudaMemoryManager.CopyH2D_Int(Vector_A_Host, Vector_A_Device, N*sizeof(int));
    cudaMemoryManager.CopyH2D_Int(Vector_B_Host, Vector_B_Device, N*sizeof(int));
    // One block of N threads: the kernel indexes with threadIdx.x only.
    add << <1, N >> > (Vector_A_Device, Vector_B_Device, Vector_C_Device);
    // Kernel launches return no status; surface launch-config errors here.
    cudaError_t launchErr = cudaGetLastError();
    if (launchErr != cudaSuccess){
        cerr << "kernel launch failed: " << cudaGetErrorString(launchErr) << endl;
        return 1;
    }
    // Blocking D2H copy also synchronizes with the kernel before we read C.
    cudaMemoryManager.CopyD2H_Int(Vector_C_Device, Vector_C_Host, N*sizeof(int));
    // Print the result vector.
    cout << "运算结果为:" << endl << "(";
    for (int i = 0; i < N; i++) cout << Vector_C_Host[i] << " ";
    cout << ")" << endl;
    // Release device memory, then pinned host memory.
    cudaMemoryManager.FreeDevice_Int(Vector_A_Device);
    cudaMemoryManager.FreeDevice_Int(Vector_B_Device);
    cudaMemoryManager.FreeDevice_Int(Vector_C_Device);
    cudaMemoryManager.FreeHost_Int(Vector_A_Host);
    cudaMemoryManager.FreeHost_Int(Vector_B_Host);
    cudaMemoryManager.FreeHost_Int(Vector_C_Host);
    return 0;
}