TensorRT及CUDA自学笔记008 记录矩阵加法计算耗时demo
cudaDeviceSynchronize 函数会阻塞主机线程,直到 GPU 完成之前提交的所有计算任务;如果其中有任务执行失败,该函数会返回相应的错误码(即失败原因)。
#include"common/common.h"
// Fill `data` with N pseudo-random floats in [0.0, 25.4].
// rand() % 0xff yields 0..254; dividing by 10 scales it into tenths.
// The RNG is seeded from the wall clock exactly once per process (guarded by
// a function-local static) so that several calls within the same second do
// not restart the sequence from the same seed.
void data_inital(float* data, int N) {
    static bool initialized = false;
    if (!initialized) {
        srand((unsigned)time(nullptr));
        initialized = true;
    }
    for (int i = 0; i < N; i++) {
        data[i] = (float)(rand() % 0xff) / 10.0f;
    }
    // (removed a stray `std::cout << std::endl;` left over from the
    // commented-out debug printing — it emitted a blank line on every call)
}
// Element-wise vector addition: c[i] = a[i] + b[i], one thread per element.
// Expects a 1-D grid of 1-D blocks; the guard handles the tail when the total
// thread count exceeds N.
__global__ void add(float* a, float* b, float* c, int N) {
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx >= N) {
        return;  // surplus thread past the end of the arrays
    }
    c[idx] = a[idx] + b[idx];
}
// Demo entry point: times a vector addition (misnamed "matrix add" in the
// notes) of 16384 floats on device 0 and prints every element's result.
// Helpers device_information / error_check / GetCPUSecond come from
// common/common.h.
int main(int argc, char** argv){
    // --- Device discovery & selection --------------------------------------
    int deviceCount {0};
    cudaDeviceProp deviceProp;
    int driverVersion {0};
    int runtimeVersion {0};
    device_information(&deviceCount, &deviceProp, &driverVersion, &runtimeVersion);
    std::cout << std::endl;

    // Bind this host thread to device 0; every subsequent CUDA call issued
    // from this thread targets that device.
    cudaError_t error = error_check(cudaSetDevice(0), __FILE__, __LINE__);
    if (error != cudaSuccess) {
        std::cout << "cudaSetDevice failed!" << std::endl;
        return -1;
    }
    std::cout << "cudaSetDevice success!" << std::endl;
    std::cout << "set on device:" << deviceProp.name << std::endl;

    // --- Host buffers -------------------------------------------------------
    const int numElem = 1 << 14;                 // 16384 elements
    const size_t nBytes = numElem * sizeof(float);
    float* hostDataA = (float*)malloc(nBytes);
    float* hostDataB = (float*)malloc(nBytes);
    float* gpuRef = (float*)malloc(nBytes);
    if (hostDataA == NULL || hostDataB == NULL || gpuRef == NULL) {
        std::cout << "malloc failed!" << std::endl;
        free(hostDataA);   // free(NULL) is a no-op, so unconditional free is safe
        free(hostDataB);
        free(gpuRef);
        return -1;
    }
    data_inital(hostDataA, numElem);
    data_inital(hostDataB, numElem);
    memset(gpuRef, 0, nBytes);

    // --- Device buffers -----------------------------------------------------
    // Check the cudaMalloc *return codes*: on failure the pointer argument is
    // left unmodified, so comparing an uninitialized pointer against NULL (as
    // the original code did) is not a reliable failure test.
    // Note: cudaMalloc is __host__ __device__, callable from host and device.
    float* deviceDataA = nullptr;
    float* deviceDataB = nullptr;
    float* deviceDataC = nullptr;
    if (cudaSuccess != error_check(cudaMalloc((void**)&deviceDataA, nBytes), __FILE__, __LINE__) ||
        cudaSuccess != error_check(cudaMalloc((void**)&deviceDataB, nBytes), __FILE__, __LINE__) ||
        cudaSuccess != error_check(cudaMalloc((void**)&deviceDataC, nBytes), __FILE__, __LINE__)) {
        std::cout << "cudaMalloc failed!" << std::endl;
        cudaFree(deviceDataA);   // cudaFree(nullptr) is a no-op
        cudaFree(deviceDataB);
        cudaFree(deviceDataC);
        free(hostDataA);
        free(hostDataB);
        free(gpuRef);
        return -1;
    }

    // --- Host -> device copies ----------------------------------------------
    // Note: cudaMemcpy is __host__ only — it cannot be called from device code.
    if (cudaSuccess == cudaMemcpy(deviceDataA, hostDataA, nBytes, cudaMemcpyHostToDevice) &&
        cudaSuccess == cudaMemcpy(deviceDataB, hostDataB, nBytes, cudaMemcpyHostToDevice) &&
        cudaSuccess == cudaMemcpy(deviceDataC, gpuRef, nBytes, cudaMemcpyHostToDevice)) {
        std::cout << "successfully copy data from host to device " << deviceProp.name << std::endl;
    } else {
        std::cout << "copy data from host to device" << deviceProp.name << " failed!" << std::endl;
        cudaFree(deviceDataA);   // was leaked on this path in the original
        cudaFree(deviceDataB);
        cudaFree(deviceDataC);
        free(hostDataA);
        free(hostDataB);
        free(gpuRef);
        return -1;
    }

    // --- Kernel launch & timing ---------------------------------------------
    dim3 block(32);
    dim3 grid((numElem + block.x - 1) / block.x);  // ceil-div: covers the tail
                                                   // even if numElem stops
                                                   // being a multiple of 32
    double timeBegin = GetCPUSecond();
    add<<<grid, block>>>(deviceDataA, deviceDataB, deviceDataC, numElem);
    // A launch returns no error directly: bad configs surface via
    // cudaGetLastError, in-kernel faults at the next synchronizing call.
    error_check(cudaGetLastError(), __FILE__, __LINE__);
    error_check(cudaDeviceSynchronize(), __FILE__, __LINE__);
    double timeEnd = GetCPUSecond();

    // --- Copy result back to host and print ---------------------------------
    cudaMemcpy(gpuRef, deviceDataC, nBytes, cudaMemcpyDeviceToHost);
    std::cout << "result: ";
    for (int i = 0; i < numElem; i++) {
        std::cout << " index: " << i << " ";
        std::cout << hostDataA[i] << " + ";
        std::cout << hostDataB[i] << " = ";
        std::cout << gpuRef[i] << " ";
        std::cout << std::endl;
    }
    printf("运算个数为:%d,矩阵加法运算时间为:%.5f\n", numElem, timeEnd - timeBegin);

    // --- Cleanup ------------------------------------------------------------
    free(hostDataA);
    free(hostDataB);
    free(gpuRef);
    cudaFree(deviceDataA);
    cudaFree(deviceDataB);
    cudaFree(deviceDataC);
    cudaDeviceReset();
    return 0;
}
// nvcc main.cu -o main.exe
...
...
...
index: 16373 14.1 + 23.5 = 37.6
index: 16374 6.5 + 0.9 = 7.4
index: 16375 0.5 + 19.1 = 19.6
index: 16376 18 + 11.2 = 29.2
index: 16377 23.9 + 7.2 = 31.1
index: 16378 20.7 + 13.8 = 34.5
index: 16379 3.7 + 23.6 = 27.3
index: 16380 1.9 + 10.2 = 12.1
index: 16381 17.5 + 7.2 = 24.7
index: 16382 7.9 + 17.5 = 25.4
index: 16383 22.1 + 3.4 = 25.5
运算个数为:16384,矩阵加法运算时间为:0.00045