#include "device_launch_parameters.h"
#include <iostream>
#include <cuda_runtime.h>
#include <cstdint>
#include <opencv2/opencv.hpp>
#include "freshman.h"
//Note: a plain `if` bounds check suffices here — a `while`/grid-stride loop is only needed when the grid may be smaller than the data.
//Matrix multiplication: A (10x32) * B (32x25600) = C (10x25600).
//Uses a 2-D thread-block layout. (No shared memory is actually used, despite the original comment's claim.)
// Computes C = A * B, all matrices row-major:
//   A is (hC x wh), B is (wh x wC), C is (hC x wC).
// Launch with any 2-D grid/block covering at least wC*hC threads; each thread
// computes exactly one element of C, chosen by its flattened global thread id.
// No shared memory is used.
__global__ void MatMulKernel1D(float* C, float* A, float* B, const int wh, const int wC, const int hC)
{
    const int totalSize = wC * hC;  // number of elements of C
    int ny = threadIdx.y + blockDim.y * blockIdx.y;
    int nx = threadIdx.x + blockIdx.x * blockDim.x;
    // Flatten the 2-D global thread coordinate into a linear id.
    // BUGFIX: the original used `ny * hC + nx`, which is only correct when
    // gridDim.x * blockDim.x happens to equal hC (true for this file's launch
    // config, 1*10 == 10, but wrong for any other). Use the real x extent.
    int thID = ny * (gridDim.x * blockDim.x) + nx;
    if (thID < totalSize) {
        int Cx = thID / wC;  // row index into C
        int Cy = thID % wC;  // column index into C
        float rst = 0.0f;    // float literal — avoid accidental double math
        // Dot product of row Cx of A with column Cy of B.
        for (int i = 0; i < wh; i++) {
            rst += A[Cx * wh + i] * B[i * wC + Cy];
        }
        C[Cx * wC + Cy] = rst;
    }
}
// CPU reference implementation of C = A * B (row-major).
//   A is (hC x wA), B is (wA x wC), C is (hC x wC).
// wA is the shared inner dimension; used to validate the GPU result.
void matrixMulCPU(float* C, const float* A, const float* B, unsigned int wA,
unsigned int wC, unsigned int hC) {
    const unsigned int rows = hC;  // rows of A and of C
    const unsigned int cols = wC;  // columns of B and of C
    for (unsigned int row = 0; row < rows; ++row) {
        for (unsigned int col = 0; col < cols; ++col) {
            // Dot product of row `row` of A with column `col` of B.
            float acc = 0.0f;
            for (unsigned int k = 0; k < wA; ++k) {
                acc += A[row * wA + k] * B[k * cols + col];
            }
            C[row * cols + col] = acc;
        }
    }
}
// Host driver: builds A (hc x wh) and B (wh x wc), multiplies them on the CPU
// and on the GPU, times both, and checks that the two results agree.
// Helpers (CHECK, initialData, cpuSecond, checkResult) come from freshman.h.
int main() {
    // set up device
    int dev = 0;
    cudaDeviceProp deviceprop;
    CHECK(cudaGetDeviceProperties(&deviceprop, dev));
    printf("Using Device : %d: %s \n", dev, deviceprop.name);
    CHECK(cudaSetDevice(dev));
    // set up data size of matrix: A (hc x wh), B (wh x wc), C (hc x wc)
    int hc = 10;
    int wh = 32;
    int wc = 25600;
    int nA = hc * wh;
    int nAbytes = nA * sizeof(float);
    printf("A Matrix size : h %d w %d \n", hc, wh);
    int nB = wh * wc;
    int nBbytes = nB * sizeof(float);
    printf("B Matrix size : h %d w %d \n", wh, wc);
    int nC = hc * wc;
    int nCbytes = nC * sizeof(float);
    // BUGFIX: this line printed "B Matrix size" for C's dimensions.
    printf("C Matrix size : h %d w %d \n", hc, wc);
    // malloc host memory
    float* a_h, * b_h, * res_h, * res_cpu;
    a_h = (float*)malloc(nAbytes);
    b_h = (float*)malloc(nBbytes);
    res_h = (float*)malloc(nCbytes);
    res_cpu = (float*)malloc(nCbytes);
    // initialize data at host device
    initialData(a_h, nA);
    initialData(b_h, nB);
    printf("initialize data complete! %f, %f", a_h[0], b_h[0]);
    // set 0 at host device for res
    memset(res_h, 0, nCbytes);
    memset(res_cpu, 0, nCbytes);
    printf("set 0 at host device complete!");
    // CPU reference multiply (timed)
    double istart = cpuSecond();
    matrixMulCPU(res_cpu, a_h, b_h, wh, wc, hc);
    double iElaps = cpuSecond() - istart;
    printf("CPU Exection time : %f \n", iElaps);
    // malloc device memory — CHECK every CUDA call: a sticky error otherwise
    // makes every later call fail mysteriously
    float* a_d, * b_d, * c_d;
    CHECK(cudaMalloc((void**)&a_d, nAbytes));
    CHECK(cudaMalloc((void**)&b_d, nBbytes));
    CHECK(cudaMalloc((void**)&c_d, nCbytes));
    // transfer data from host to device
    CHECK(cudaMemcpy(a_d, a_h, nAbytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(b_d, b_h, nBbytes, cudaMemcpyHostToDevice));
    // invoke kernel on the device
    int dimx = 10;
    int dimy = 64;
    dim3 block(dimx, dimy);
    dim3 grid((hc + block.x - 1) / block.x, (wc + block.y - 1) / block.y);
    istart = cpuSecond();
    MatMulKernel1D <<<grid, block >>> (c_d, a_d, b_d, wh, wc, hc);
    CHECK(cudaGetLastError());       // catch launch-configuration errors
    CHECK(cudaDeviceSynchronize());  // wait for the kernel; surface async faults
    iElaps = cpuSecond() - istart;
    printf("GPU Execution configuration<<<(%d,%d),(%d,%d)>>> Time elapsed %f sec\n",
        grid.x, grid.y, block.x, block.y, iElaps);
    CHECK(cudaMemcpy(res_h, c_d, nCbytes, cudaMemcpyDeviceToHost));
    // check if cpu result and gpu result are the same
    checkResult(res_h, res_cpu, nC);
    // free device memory
    CHECK(cudaFree(a_d));
    CHECK(cudaFree(b_d));
    CHECK(cudaFree(c_d));
    // free host memory
    free(a_h);
    free(b_h);
    free(res_h);
    free(res_cpu);  // BUGFIX: res_cpu was leaked in the original
    // reset device
    CHECK(cudaDeviceReset());
    return 0;
}
// CUDA matrix multiplication example.
// (Stray blog-scrape trailer converted to a comment so the file compiles;
// original text: "最新推荐文章于 2024-04-29 13:34:21 发布")