C是行主序,A,B是列主序,一维化地存储在数组里,并copy到GPU的全局内存中。
一. 最容易理解的实现
一解释就能让人理解的代码实现,也是一项挺难的任务^^。
尝试实现如下:
设置grid为一维
即gridDim.y=gridDim.z=1 默认值;gridDim.x=MP的个数,比如RTX 3060含30个MP(即SM),V100含80个MP;于是,在不同的平台里,分别,gridDim.x=30或80;
设置block为一维
在 RTX 3060中,(128cuda core)/sm,每个warp在运行时会占用16个cuda core,同时每个warp有32个thread,于是可以考虑让blockDim.x=(128/16) *32=256 thread,刚好能让所有的cuda core都跑起来。
在V100中,(64 cuda core)/sm,同上,blockDim.x=(64/16)*32=128 thread。
代码计算实现规则
什么都不计较,只考虑把结果算出来。首先,按照C在一维数组中的顺序,每个线程一次负责一个C元素的计算,C[index]。按照线性代数矩阵乘法定义,相应A矩阵的某一行的元素,分别乘以B矩阵的某一列元素,乘积累加后赋值给C[index]。
因为C,Grid,Block都是一维的,故,最初,某block中的某线程里有:
index = blockIdx.x*blockDim.x + threadIdx.x;
每个线程计算出一个C元素的值后,再去一维数组中寻找下一个自己负责的C元素。下一个元素的位置,会跨越gridDim.x*blockDim.x个一维数组元素,即
index=index+gridDim.x*blockDim.x;
至此,自我感觉良好,还是最容易理解的一个代码实现思路。
如果有更容易理解的思路,请不吝留言演示^^
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define M_MAT_A_ROW_NUM 4023 //how many rows in A
#define K_MAT_A_COLUMN_NUM 1024 //how many column in A
#define K_MAT_B_ROW_NUM K_MAT_A_COLUMN_NUM //how many rows in B
#define N_MAT_B_COLUMN_NUM 1025 //how many column in B
#define RTX3060
#ifdef RTX3060
#define SM_NUM 30
#define CUDA_CORE_PER_SM 128
#define CUDA_CORE_PER_WARP 16
#else
#ifdef V100
#define SM_NUM 80
#define CUDA_CORE_PER_SM 64
#define CUDA_CORE_PER_WARP 16
#else
#define SM_NUM 40
#define CUDA_CORE_PER_SM 128
#define CUDA_CORE_PER_WARP 16
#endif
#endif
#define M_MAT_C_ROW_NUM M_MAT_A_ROW_NUM //how many rows in C
#define N_MAT_C_COLUMN_NUM N_MAT_B_COLUMN_NUM //how many column in C
#define MATRIX_GLOBAL_SIZE (M_MAT_C_ROW_NUM * N_MAT_C_COLUMN_NUM)
void mulMatrixWithCpu(float* c, float* a, float* b);
cudaError_t mulMatrixWithCuda(float *c, float *a, float *b);
// C = A * B.
// A (M x K) and B (K x N) are column-major, C (M x N) is row-major; all three
// are flattened into 1-D arrays. Launch layout: 1-D grid, 1-D block. Each
// thread computes whole C elements via a grid-stride loop, so correctness is
// independent of the launch configuration (even <<<1,1>>> works).
__global__ void mulKernel(float* c, float* a, float* b)
{
    const int stride = gridDim.x * blockDim.x;
    for (int index = blockIdx.x * blockDim.x + threadIdx.x;
         index < MATRIX_GLOBAL_SIZE;
         index += stride)
    {
        int i = index / N_MAT_C_COLUMN_NUM; // row of C (and of A)
        int j = index % N_MAT_C_COLUMN_NUM; // column of C (and of B)
        // Accumulate the dot product in a register and store once, instead of
        // read-modify-writing c[index] K times. Besides saving global-memory
        // traffic, this removes the hidden requirement that dev_c be
        // zero-initialized before launch (cudaMalloc does not clear memory,
        // so "+=" on it accumulated onto garbage).
        float sum = 0.0f;
        for (int k = 0; k < K_MAT_A_COLUMN_NUM; k++)
        {
            // A column-major: element (i,k) at i + k*M.
            // B column-major: element (k,j) at k + j*K.
            sum += a[i + k * M_MAT_A_ROW_NUM] * b[K_MAT_A_COLUMN_NUM * j + k];
        }
        c[index] = sum;
    }
}
int main()
{
    // Host buffers: A (M x K), B (K x N), and one C (M x N) buffer each for
    // the GPU result and the CPU reference used for validation.
    float* a = (float*)malloc(M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM * sizeof(float));
    float* b = (float*)malloc(K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM * sizeof(float));
    float* c_gpu_result = (float*)malloc(MATRIX_GLOBAL_SIZE * sizeof(float));
    float* c_cpu_result = (float*)malloc(MATRIX_GLOBAL_SIZE * sizeof(float));
    if (a == NULL || b == NULL || c_gpu_result == NULL || c_cpu_result == NULL) {
        fprintf(stderr, "host malloc failed!\n");
        free(a); free(b); free(c_gpu_result); free(c_cpu_result);
        return 1;
    }
    srand((unsigned)time(NULL));
    // Both result buffers start at zero (the implementations accumulate).
    for (int i = 0; i < MATRIX_GLOBAL_SIZE; i++)
    {
        c_gpu_result[i] = 0.0f;
        c_cpu_result[i] = 0.0f;
    }
    // Fill A and B with random values from {0, 1/3, 2/3}.
    for (int i = 0; i < M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM; i++)
        a[i] = (rand() % 3) / 3.0f;
    for (int i = 0; i < K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM; i++)
        b[i] = (rand() % 3) / 3.0f;
    mulMatrixWithCpu(c_cpu_result, a, b);
    // Multiply matrix in parallel.
    cudaError_t cudaStatus = mulMatrixWithCuda(c_gpu_result, a, b);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "mulMatrixWithCuda failed!");
        free(a); free(b); free(c_gpu_result); free(c_cpu_result);
        return 1;
    }
    printf("Sample results:\n");
    for (int i = 0; i < 7; i++)
        printf("cpu[%d]: %5f,gpu[%d]:%5f\n", i, c_cpu_result[i], i, c_gpu_result[i]);
    printf("\n\nStarting validation the result...\n ");
    // Compare with an absolute tolerance; bit-exact equality between host and
    // device float arithmetic is not expected.
    for (int i = 0; i < MATRIX_GLOBAL_SIZE; i++)
        if (c_cpu_result[i] - c_gpu_result[i] > 0.01 || c_gpu_result[i] - c_cpu_result[i] > 0.01)
            printf("c[%d] is not correct\n", i);
    printf("Validation is completed.\n ");
    free(a);
    free(b);
    free(c_gpu_result);
    free(c_cpu_result);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
//A: column major; B: column major; C: row major;
// CPU reference implementation of C = A * B, used to validate the GPU result.
// c must hold M*N floats; a is M*K and b is K*N, both column-major.
void mulMatrixWithCpu(float* c, float* a, float* b)
{
    for (int i = 0; i < M_MAT_A_ROW_NUM * N_MAT_B_COLUMN_NUM; i++)
    {
        int i_x = i / N_MAT_C_COLUMN_NUM; // row of C / row of A
        int i_y = i % N_MAT_C_COLUMN_NUM; // column of C / column of B
        // Accumulate into a local variable and assign once, so the function
        // no longer depends on the caller having pre-zeroed c.
        float sum = 0.0f;
        for (int j = 0; j < K_MAT_A_COLUMN_NUM; j++)
        {
            sum += a[i_x + j * M_MAT_A_ROW_NUM] * b[j + i_y * K_MAT_B_ROW_NUM];
        }
        c[i] = sum;
    }
}
__global__ void blank_warmingGPU() {}
// Allocates device buffers, uploads A and B, runs mulKernel once (timed with
// CUDA events, after a warm-up launch), and copies the M x N product back
// into c. c must hold MATRIX_GLOBAL_SIZE floats; a is M*K and b is K*N,
// both column-major. Returns the first failing CUDA status, or cudaSuccess.
cudaError_t mulMatrixWithCuda(float* c, float* a, float* b)
{
    float* dev_a = NULL;
    float* dev_b = NULL;
    float* dev_c = NULL;
    cudaError_t cudaStatus = cudaSuccess;
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_1=%d \n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_c, MATRIX_GLOBAL_SIZE * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_2=%d \n", cudaStatus);
        goto Error;
    }
    // dev_c must start at zero: mulKernel accumulates with "+=", and
    // cudaMalloc does NOT clear device memory. Without this memset the
    // kernel would add onto uninitialized garbage.
    cudaStatus = cudaMemset(dev_c, 0, MATRIX_GLOBAL_SIZE * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_memset=%d \n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_3=%d \n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_4=%d \n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_a, a, M_MAT_A_ROW_NUM * K_MAT_A_COLUMN_NUM * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_5=%d \n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, K_MAT_B_ROW_NUM * N_MAT_B_COLUMN_NUM * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_6=%d \n", cudaStatus);
        goto Error;
    }
    // Absorb one-time startup overhead so it is not counted in the timing.
    blank_warmingGPU << <1, 1 >> > ();
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    // Execute kernel. One block per SM; block size chosen so every CUDA core
    // is busy, e.g. RTX 3060: <<<30, 256>>> = 32 threads/warp * (128/16) warps.
    mulKernel << <SM_NUM, 32 * CUDA_CORE_PER_SM / CUDA_CORE_PER_WARP >> > (dev_c, dev_a, dev_b);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float time;
    cudaEventElapsedTime(&time, start, stop);
    printf("Time_mulKernel is %f ms.\n\n", time);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    // Launch-configuration errors surface here, not at the launch itself.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_7=%d \n", cudaStatus);
        goto Error;
    }
    // Asynchronous execution errors surface at the next synchronizing call.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_8=%d \n", cudaStatus);
        goto Error;
    }
    cudaStatus = cudaMemcpy(c, dev_c, MATRIX_GLOBAL_SIZE * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        printf("cudaStatus_9=%d \n", cudaStatus);
        goto Error;
    }
Error:
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return cudaStatus;
}
运行输出如下:
当配置ABsize如下时,
#define M_MAT_A_ROW_NUM 4023 //how many rows in A
#define K_MAT_A_COLUMN_NUM 1024 //how many column in A
#define K_MAT_B_ROW_NUM K_MAT_A_COLUMN_NUM //how many rows in B
#define N_MAT_B_COLUMN_NUM 1025 //how many column in B
Debug模式:
Release模式:
矩阵是每次按时间种子随机生成的,两次的数值结果不尽相同。
C=AB A是4023*1024的矩阵;B是1024*1025的矩阵;C是4023*1025.
在RTX3060上,debug模式下,kernel运行时间为601.093140ms;
在release模式下,kernel的运行时间为199.364609ms,3倍的速度。
C的每个元素是由K个A中的元素,乘以 K个B中的元素,每次都要从全局内存中读进来。所以,
这个kernel在运算时,对全局内存的随机访问量为M*N*2K=2MNK 个float变量;但是矩阵的实际数据量为A的M*K,B的K*N,总数据量为 M*K + K*N=K(M+N)个float变量。当M与N近似时,多余地重复访问了几乎M倍。这造成了很大的带宽浪费。
===================================================
当配置M=N=K=4096时,这种方式的计算时间,
在release模式时为 3862.350830ms
在debug 模式时为 9491.225586ms
不考虑cache的话,为了计算C矩阵的4096x4096个元素,每个元素是由A和B的一行与一列的点积得到,每个点积需要读取4096个A的元素和4096个B的元素,总共需要从全局存储区读取(2x4096)x(4096x4096)个float变量。下一次我们用tiling方法,看看能够减少多少次对全局存储区的访问。