cuda 矩阵乘法,从最容易理解到算得最快(第一版源码)

C是行主序,A,B是列主序,一维化地存储在数组里,并copy到GPU的全局内存中。

一.  最容易理解的实现

        一解释就能让人理解的代码实现,也是一项挺难的任务^^。

尝试实现如下:

设置grid为一维

即gridDim.y=gridDim.z=1 默认值;gridDim.x=MP的个数,比如RTX 3060含30个MP(即SM),V100含80个MP;于是,在不同的平台里,分别,gridDim.x=30或80;

设置block为一维

在 RTX 3060中,(128cuda core)/sm,每个warp在运行时会占用16个cuda core,同时每个warp有32个thread,于是可以考虑让blockDim.x=(128/16) *32=256 thread,刚好能让所有的cuda core都跑起来。

在V100中,(64 cuda core)/sm,同上,blockDim.x=(64/16)*32=128 thread。

代码计算实现规则

什么都不计较,只考虑把结果算出来。首先,按照C在一维数组中的顺序,每个线程一次负责一个C元素的计算,C[index]。按照线性代数矩阵乘法定义,相应A矩阵的某一行的元素,分别乘以B矩阵的某一列元素,乘积累加后赋值给C[index]。

因为C,Grid,Block都是一维的,故,最初,某block中的某线程里有:

index = blockIdx.x*blockDim.x + threadIdx.x;

每个线程计算出一个C元素的值后,再去一维数组中寻找下一个自己负责的C元素。下一个元素的位置,会跨越gridDim.x*blockDim.x个一维数组元素,即

index=index+gridDim.x*blockDim.x;

至此,自我感觉良好,还是最容易理解的一个代码实现思路。

如果有更容易理解的思路,请不吝留言演示^^


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define M_MAT_A_ROW_NUM         4023 //how many rows in A
#define K_MAT_A_COLUMN_NUM      1024 //how many column in A
#define K_MAT_B_ROW_NUM         K_MAT_A_COLUMN_NUM //how many rows in B
#define N_MAT_B_COLUMN_NUM      1025 //how many column in B

#define RTX3060
#ifdef RTX3060
#define SM_NUM                  30
#define CUDA_CORE_PER_SM        128
#define CUDA_CORE_PER_WARP      16
#else 
#ifdef V100
#define SM_NUM   80
#define CUDA_CORE_PER_SM        64
#define CUDA_CORE_PER_WARP      16
#else
#define SM_NUM   40
#define CUDA_CORE_PER_SM        128
#define CUDA_CORE_PER_WARP      16
#endif
#endif
#define M_MAT_C_ROW_NUM         M_MAT_A_ROW_NUM        //how many rows in C
#define N_MAT_C_COLUMN_NUM      N_MAT_B_COLUMN_NUM     //how many column in C
#define MATRIX_GLOBAL_SIZE      (M_MAT_C_ROW_NUM * N_MAT_C_COLUMN_NUM)

void mulMatrixWithCpu(float* c, float* a, float* b);
cudaError_t mulMatrixWithCuda(float *c, float *a, float *b);

// Computes C = A * B (NOT aAB+bC as the old comment claimed — there is no
// alpha/beta scaling anywhere in this kernel).
// A (M x K) and B (K x N) are column-major, C (M x N) is row-major; all three
// are flat 1-D arrays in global memory.
// Launch: 1-D grid, 1-D block.  Each thread walks the flat C array with a
// grid-stride loop, computing one C element per iteration.
__global__ void mulKernel(float* c, float* a, float* b)
{
    int stride = gridDim.x * blockDim.x;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x;
         index < MATRIX_GLOBAL_SIZE;
         index += stride)
    {
        int i = index / N_MAT_C_COLUMN_NUM; // row of C (= row of A)
        int j = index % N_MAT_C_COLUMN_NUM; // column of C (= column of B)

        // Accumulate the dot product in a register and store once.  The
        // original did `c[index] += ...` directly on global memory, which
        // (a) re-read/re-wrote global memory K times per element, and
        // (b) silently depended on dev_c being zeroed — but cudaMalloc does
        // not clear memory and the host never uploads c, so the original
        // accumulated onto uninitialized garbage.
        float sum = 0.0f;
        for (int k = 0; k < K_MAT_A_COLUMN_NUM; k++)
        {
            // A is column-major: element (i,k) lives at i + k*M.
            // B is column-major: element (k,j) lives at k + j*K.
            sum += a[i + k * M_MAT_A_ROW_NUM] * b[K_MAT_A_COLUMN_NUM * j + k];
        }
        c[index] = sum;
    }
}


// Host driver: fills A and B with random values in {0, 1/3, 2/3}, computes a
// CPU reference product, runs the CUDA kernel, and cross-checks the two
// results element by element with an absolute tolerance of 0.01.
int main()
{
    float* a = (float*)malloc(M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM * sizeof(float));
    float* b = (float*)malloc(K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM * sizeof(float));
    float* c_gpu_result = (float*)malloc(MATRIX_GLOBAL_SIZE * sizeof(float));
    float* c_cpu_result = (float*)malloc(MATRIX_GLOBAL_SIZE * sizeof(float));
    if (a == NULL || b == NULL || c_gpu_result == NULL || c_cpu_result == NULL) {
        fprintf(stderr, "host memory allocation failed!\n");
        free(a); free(b); free(c_gpu_result); free(c_cpu_result);
        return 1;
    }

    srand((unsigned)time(NULL));
    // The CPU reference accumulates into c, so both result buffers must start at 0.
    for (int i = 0; i < MATRIX_GLOBAL_SIZE; i++)
    {
        c_gpu_result[i] = 0.0f;
        c_cpu_result[i] = 0.0f;
    }
    for (int i = 0; i < M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM; i++)
        a[i] = (rand() % 3) / 3.0f;   // one of 0.0, 0.333..., 0.666...
    for (int i = 0; i < K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM; i++)
        b[i] = (rand() % 3) / 3.0f;

    mulMatrixWithCpu(c_cpu_result, a, b);

    // Multiply matrix in parallel.
    cudaError_t cudaStatus = mulMatrixWithCuda(c_gpu_result, a, b);
    if (cudaStatus != cudaSuccess) {
        // Original message said "addWithCuda" — a leftover from the SDK
        // sample this file was derived from.
        fprintf(stderr, "mulMatrixWithCuda failed!\n");
        free(a); free(b); free(c_gpu_result); free(c_cpu_result);
        return 1;
    }
    printf("Sample results:\n");
    for (int i = 0; i < 7; i++)
        printf("cpu[%d]: %5f,gpu[%d]:%5f\n", i, c_cpu_result[i], i, c_gpu_result[i]);

    printf("\n\nStarting validation the result...\n ");
    int mismatch_num = 0;
    for (int i = 0; i < MATRIX_GLOBAL_SIZE; i++)
        if (c_cpu_result[i] - c_gpu_result[i] > 0.01 || c_gpu_result[i] - c_cpu_result[i] > 0.01)
        {
            printf("c[%d] is not correct\n", i);
            mismatch_num++;
        }
    printf("Validation is completed, %d mismatch(es).\n ", mismatch_num);

    free(a);
    free(b);
    free(c_gpu_result);
    free(c_cpu_result);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }

    return 0;
}
//A: column major; B: column major; C: row major;
void mulMatrixWithCpu(float* c, float* a, float* b)
{
    int i_x = 0, i_y = 0;

    for (int i = 0;i < M_MAT_A_ROW_NUM * N_MAT_B_COLUMN_NUM;i++)
    {
        i_x = i / N_MAT_C_COLUMN_NUM;  //i_x line of A,
        i_y = i % N_MAT_C_COLUMN_NUM;  //i_y column of B;
        for (int j = 0;j < K_MAT_A_COLUMN_NUM;j++)
        {
            c[i] += a[i_x + j * M_MAT_A_ROW_NUM] * b[j + i_y * K_MAT_B_ROW_NUM];
        }   
    }
}

__global__ void blank_warmingGPU() {}

// Allocates device buffers, uploads A and B, launches mulKernel (timed with
// CUDA events after a warm-up launch), and copies the product back into c.
// Keeps the file's printf + goto-Error convention; all device pointers start
// as NULL, so cudaFree on an early-exit path is a harmless no-op.
cudaError_t mulMatrixWithCuda(float* c, float* a, float* b)
{
    float *dev_a = NULL;
    float *dev_b = NULL;
    float* dev_c = NULL;
    cudaError_t cudaStatus = cudaSuccess;

    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess){
        printf("cudaStatus_1=%d \n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaMalloc((float**)&dev_c, MATRIX_GLOBAL_SIZE * sizeof(float));
    if (cudaStatus != cudaSuccess){
        printf("cudaStatus_2=%d \n", cudaStatus);
        goto Error;
    }

    // cudaMalloc does NOT clear memory and the host never uploads c, so a
    // kernel that accumulates with `+=` would otherwise start from garbage.
    // Zero-fill dev_c before the launch.
    cudaStatus = cudaMemset(dev_c, 0, MATRIX_GLOBAL_SIZE * sizeof(float));
    if (cudaStatus != cudaSuccess){
        printf("cudaStatus_2m=%d \n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaMalloc((float**)&dev_a, M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_3=%d \n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaMalloc((float**)&dev_b, K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_4=%d \n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_a, a, M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_5=%d \n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_6=%d \n", cudaStatus);
        goto Error;
    }

    // Warm up the GPU so context-creation cost is excluded from the timing.
    blank_warmingGPU << <1, 1 >> > ();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    // Launch config: one block per SM; threads per block =
    // 32 threads/warp * (cores-per-SM / cores-per-warp), e.g. <<<30, 256>>>
    // on an RTX 3060 (30 SMs, 128 cores/SM, 16 cores/warp).
    mulKernel << <SM_NUM, 32*CUDA_CORE_PER_SM / CUDA_CORE_PER_WARP >> > (dev_c, dev_a, dev_b);
    cudaEventRecord(stop);
    // cudaEventSynchronize blocks until the kernel (and the stop event) has
    // completed, so the elapsed time below covers the whole kernel.
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    printf("Time_mulKernel is %f ms.\n\n", time);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Catch launch-configuration errors (bad grid/block dims, etc.).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess){
        printf("cudaStatus_7=%d \n", cudaStatus);
        goto Error;
    }

    // Catch asynchronous execution errors raised while the kernel ran.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess){
        printf("cudaStatus_8=%d \n", cudaStatus);
        goto Error;
    }

    cudaStatus = cudaMemcpy(c, dev_c, MATRIX_GLOBAL_SIZE * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess){
        printf("cudaStatus_9=%d \n", cudaStatus);
        goto Error;
    }

Error:
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return cudaStatus;
}

运行输出如下:

当配置ABsize如下时,

#define M_MAT_A_ROW_NUM         4023 //how many rows in A
#define K_MAT_A_COLUMN_NUM      1024 //how many column in A
#define K_MAT_B_ROW_NUM         K_MAT_A_COLUMN_NUM //how many rows in B
#define N_MAT_B_COLUMN_NUM      1025 //how many column in B

Debug模式:

Release模式:

矩阵是每次按时间种子随机生成的,两次的数值结果不尽相同。

C=AB       A是4023*1024的矩阵;B是1024*1025的矩阵;C是4023*1025.

在RTX3060上,debug模式下,kernel运行时间为601.093140ms;

在release模式下,kernel的运行时间为199.364609ms,3倍的速度。

C的每个元素是由K个A中的元素,乘以 K个B中的元素,每次都要从全局内存中读进来。所以,

这个kernel在运算时,对全局内存的随机访问量为M*N*2K=2MNK  个float变量;但是矩阵的实际数据量为A的M*K,B的K*N,总数据量为  M*K + K*N=K(M+N)个float变量。当M与N近似时,多余地重复访问了几乎M倍。这造成了很大的带宽浪费。

===================================================

当配置M=N=K=4096时,这种方式的计算时间,

在release模式时为   3862.350830ms

在debug  模式时为   9491.225586ms

不考虑cache的话,为了计算C矩阵的4096x4096个元素,每个元素是由A和B的一行与一列的点积得到,每个点积需要读取4096个A的元素和4096个B的元素,总共需要从全局存储区读取(2x4096)x(4096x4096)次float变量。下一次我们用tiling方法,看看能够减少多少次对全局存储区的访问。

 

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值