TensorRT及CUDA自学笔记008 记录矩阵加法计算耗时demo
cudaDeviceSynchronize 函数会阻塞主机线程,直到 GPU 完成之前提交的所有计算任务;如果其中有任务执行失败,该函数会返回相应的错误码(即失败原因)。
#include"common/common.h"
// Fill `data` with N pseudo-random floats in [0.0, 25.4].
// rand() % 0xff yields 0..254; dividing by 10 scales it into tenths.
// The RNG is seeded from the wall clock exactly once per process (guarded by
// a function-local static) so that several calls within the same second do
// not restart the sequence from the same seed.
void data_inital(float* data, int N) {
    static bool initialized = false;
    if (!initialized) {
        srand((unsigned)time(nullptr));
        initialized = true;
    }
    for (int i = 0; i < N; i++) {
        data[i] = (float)(rand() % 0xff) / 10.0f;
    }
    // (removed a stray `std::cout << std::endl;` left over from the
    // commented-out debug printing — it emitted a blank line on every call)
}
// Element-wise vector addition: c[i] = a[i] + b[i], one thread per element.
// Expects a 1-D grid of 1-D blocks; the guard handles the tail when the total
// thread count exceeds N.
__global__ void add(float* a, float* b, float* c, int N) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N) {
        return;  // surplus thread past the end of the arrays
    }
    c[idx] = a[idx] + b[idx];
}
// Demo entry point: times a vector addition (misnamed "matrix add" in the
// notes) of 16384 floats on device 0 and prints every element's result.
// Helpers device_information / error_check / GetCPUSecond come from
// common/common.h.
int main(int argc, char** argv){
    // --- Device discovery & selection --------------------------------------
    int deviceCount {0};
    cudaDeviceProp deviceProp;
    int driverVersion {0};
    int runtimeVersion {0};
    device_information(&deviceCount, &deviceProp, &driverVersion, &runtimeVersion);
    std::cout << std::endl;

    // Bind this host thread to device 0; every subsequent CUDA call issued
    // from this thread targets that device.
    cudaError_t error = error_check(cudaSetDevice(0), __FILE__, __LINE__);
    if (error != cudaSuccess) {
        std::cout << "cudaSetDevice failed!" << std::endl;
        return -1;
    }
    std::cout << "cudaSetDevice success!" << std::endl;
    std::cout << "set on device:" << deviceProp.name << std::endl;

    // --- Host buffers -------------------------------------------------------
    const int numElem = 1 << 14;                 // 16384 elements
    const size_t nBytes = numElem * sizeof(float);
    float* hostDataA = (float*)malloc(nBytes);
    float* hostDataB = (float*)malloc(nBytes);
    float* gpuRef = (float*)malloc(nBytes);
    if (hostDataA == NULL || hostDataB == NULL || gpuRef == NULL) {
        std::cout << "malloc failed!" << std::endl;
        free(hostDataA);   // free(NULL) is a no-op, so unconditional free is safe
        free(hostDataB);
        free(gpuRef);
        return -1;
    }
    data_inital(hostDataA, numElem);
    data_inital(hostDataB, numElem);
    memset(gpuRef, 0, nBytes);

    // --- Device buffers -----------------------------------------------------
    // Check the cudaMalloc *return codes*: on failure the pointer argument is
    // left unmodified, so comparing an uninitialized pointer against NULL (as
    // the original code did) is not a reliable failure test.
    // Note: cudaMalloc is __host__ __device__, callable from host and device.
    float* deviceDataA = nullptr;
    float* deviceDataB = nullptr;
    float* deviceDataC = nullptr;
    if (cudaSuccess != error_check(cudaMalloc((void**)&deviceDataA, nBytes), __FILE__, __LINE__) ||
        cudaSuccess != error_check(cudaMalloc((void**)&deviceDataB, nBytes), __FILE__, __LINE__) ||
        cudaSuccess != error_check(cudaMalloc((void**)&deviceDataC, nBytes), __FILE__, __LINE__)) {
        std::cout << "cudaMalloc failed!" << std::endl;
        cudaFree(deviceDataA);   // cudaFree(nullptr) is a no-op
        cudaFree(deviceDataB);
        cudaFree(deviceDataC);
        free(hostDataA);
        free(hostDataB);
        free(gpuRef);
        return -1;
    }

    // --- Host -> device copies ----------------------------------------------
    // Note: cudaMemcpy is __host__ only — it cannot be called from device code.
    if (cudaSuccess == cudaMemcpy(deviceDataA, hostDataA, nBytes, cudaMemcpyHostToDevice) &&
        cudaSuccess == cudaMemcpy(deviceDataB, hostDataB, nBytes, cudaMemcpyHostToDevice) &&
        cudaSuccess == cudaMemcpy(deviceDataC, gpuRef, nBytes, cudaMemcpyHostToDevice)) {
        std::cout << "successfully copy data from host to device " << deviceProp.name << std::endl;
    } else {
        std::cout << "copy data from host to device" << deviceProp.name << " failed!" << std::endl;
        cudaFree(deviceDataA);   // was leaked on this path in the original
        cudaFree(deviceDataB);
        cudaFree(deviceDataC);
        free(hostDataA);
        free(hostDataB);
        free(gpuRef);
        return -1;
    }

    // --- Kernel launch & timing ---------------------------------------------
    dim3 block(32);
    dim3 grid((numElem + block.x - 1) / block.x);  // ceil-div: covers the tail
                                                   // even if numElem stops
                                                   // being a multiple of 32
    double timeBegin = GetCPUSecond();
    add<<<grid, block>>>(deviceDataA, deviceDataB, deviceDataC, numElem);
    // A launch returns no error directly: bad configs surface via
    // cudaGetLastError, in-kernel faults at the next synchronizing call.
    error_check(cudaGetLastError(), __FILE__, __LINE__);
    error_check(cudaDeviceSynchronize(), __FILE__, __LINE__);
    double timeEnd = GetCPUSecond();

    // --- Copy result back to host and print ---------------------------------
    cudaMemcpy(gpuRef, deviceDataC, nBytes, cudaMemcpyDeviceToHost);
    std::cout << "result: ";
    for (int i = 0; i < numElem; i++) {
        std::cout << " index: " << i << " ";
        std::cout << hostDataA[i] << " + ";
        std::cout << hostDataB[i] << " = ";
        std::cout << gpuRef[i] << " ";
        std::cout << std::endl;
    }
    printf("运算个数为:%d,矩阵加法运算时间为:%.5f\n", numElem, timeEnd - timeBegin);

    // --- Cleanup ------------------------------------------------------------
    free(hostDataA);
    free(hostDataB);
    free(gpuRef);
    cudaFree(deviceDataA);
    cudaFree(deviceDataB);
    cudaFree(deviceDataC);
    cudaDeviceReset();
    return 0;
}
// nvcc main.cu -o main.exe
...
...
...
index: 16373 14.1 + 23.5 = 37.6
index: 16374 6.5 + 0.9 = 7.4
index: 16375 0.5 + 19.1 = 19.6
index: 16376 18 + 11.2 = 29.2
index: 16377 23.9 + 7.2 = 31.1
index: 16378 20.7 + 13.8 = 34.5
index: 16379 3.7 + 23.6 = 27.3
index: 16380 1.9 + 10.2 = 12.1
index: 16381 17.5 + 7.2 = 24.7
index: 16382 7.9 + 17.5 = 25.4
index: 16383 22.1 + 3.4 = 25.5
运算个数为:16384,矩阵加法运算时间为:0.00045