CUDA/GPU下矩阵乘法的几种实现的C++源码

最新推荐文章于 2024-09-24 22:30:00 发布

mathsoperator

最新推荐文章于 2024-09-24 22:30:00 发布

阅读量1.3w

点赞数 3

分类专栏： C/C++/C#/Fortran 文章标签： c++ float timer c cuda 测试

本文链接：https://blog.csdn.net/mathsoperator/article/details/6125087

版权

C/C++/C#/Fortran 专栏收录该内容

24 篇文章

订阅专栏

本文介绍了一种利用CUDA进行矩阵乘法的优化方法，并详细展示了五种不同的实现方式及其性能对比，包括简单方法、块方法、块加循环展开方法、块加线程粒度以及块加循环展开加线程粒度。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

#include "cutil_inline.h" #include "cublas.h" void MatrixMul(const float *A, const float *B, float *C, int Width) { int i, j, k; for(i=0; i<Width; i++) for(j=0; j<Width; j++){ float s=0; for(k=0; k<Width; k++) s+=A[i*Width+k]*B[k*Width+j]; C[i*Width+j]=s; } } #define TILE_WIDTH 16 //简单方法 __global__ void MatrixMulKernel1(float* Md, float* Nd, float* Pd, int Width) { int x = threadIdx.x+blockIdx.x*blockDim.x; int y = threadIdx.y+blockIdx.y*blockDim.y; float Pvalue = 0; for (int k = 0; k < Width; ++k) Pvalue+=Md[y * Width + k]*Nd[k * Width + x]; Pd[y*Width + x] = Pvalue; } //块方法 __global__ void MatrixMulKernel2(float* Md, float* Nd, float* Pd, int Width) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; float Pvalue = 0; for (int m = 0; m < gridDim.x; ++m) { __shared__ float Mds[TILE_WIDTH][TILE_WIDTH]; __shared__ float Nds[TILE_WIDTH][TILE_WIDTH]; Mds[ty][tx] = *(Md + (by*blockDim.y + ty) * Width + m*blockDim.x + tx); Nds[ty][tx] = *(Nd + (m*blockDim.y + ty) * Width + bx*blockDim.x + tx); __syncthreads(); for (int k = 0; k < blockDim.x; ++k) Pvalue += Mds[ty][k] * Nds[k][tx]; __syncthreads(); } Pd[(by*blockDim.y+ty)*Width+bx*blockDim.x+tx] = Pvalue; } //块方法, 循环展开 __global__ void MatrixMulKernel3(float* Md, float* Nd, float* Pd, int Width) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; float Pvalue = 0; for (int m = 0; m < gridDim.x; ++m) { __shared__ float Mds[TILE_WIDTH][TILE_WIDTH]; __shared__ float Nds[TILE_WIDTH][TILE_WIDTH]; Mds[ty][tx] = *(Md + (by*blockDim.y + ty) * Width + m*blockDim.x + tx); Nds[ty][tx] = *(Nd + (m*blockDim.y + ty) * Width + bx*blockDim.x + tx); __syncthreads(); Pvalue += Mds[ty][0] * Nds[0][tx] + Mds[ty][1] * Nds[1][tx] + Mds[ty][2] * Nds[2][tx] + Mds[ty][3] * Nds[3][tx] + Mds[ty][4] * Nds[4][tx] + Mds[ty][5] * Nds[5][tx] + Mds[ty][6] * Nds[6][tx] + Mds[ty][7] * Nds[7][tx] + Mds[ty][8] * Nds[8][tx] + Mds[ty][9] * Nds[9][tx] + Mds[ty][10] * Nds[10][tx] + Mds[ty][11] * Nds[11][tx] + Mds[ty][12] * Nds[12][tx] + Mds[ty][13] * Nds[13][tx] + Mds[ty][14] * Nds[14][tx] + Mds[ty][15] * Nds[15][tx]; __syncthreads(); } Pd[(by*blockDim.y+ty)*Width+bx*blockDim.x+tx] = Pvalue; } #define ThreadColumn 2 //块方法, 线程粒度 __global__ void MatrixMulKernel4(float* Md, float* Nd, float* Pd, int Width) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; float Pvalue1 = 0, Pvalue2=0; for (int m = 0; m < gridDim.x; ++m) { __shared__ float Mds[TILE_WIDTH][ThreadColumn*TILE_WIDTH]; __shared__ float Nds[ThreadColumn*TILE_WIDTH][ThreadColumn*TILE_WIDTH]; for(int k=0;k<ThreadColumn; ++k) Mds[ty][tx+k*TILE_WIDTH] = *(Md + (by*blockDim.y + ty) * Width + ThreadColumn*m*blockDim.x + tx + k*TILE_WIDTH ); for(int h=0;h<ThreadColumn; ++h) for(int k=0; k<ThreadColumn; ++k) Nds[ty+h*TILE_WIDTH][tx+k*TILE_WIDTH] = *(Nd + (m*blockDim.y*ThreadColumn + ty + h*TILE_WIDTH) * Width + bx*blockDim.x*ThreadColumn + tx + k*TILE_WIDTH); __syncthreads(); for(int h=0;h<ThreadColumn*TILE_WIDTH;++h) Pvalue1 +=Mds[ty][h] * Nds[h][tx]; for(int k=0;k<ThreadColumn*TILE_WIDTH;++k) Pvalue2 +=Mds[ty][k] * Nds[k][tx+TILE_WIDTH]; __syncthreads(); } Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx] = Pvalue1; Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx + TILE_WIDTH] = Pvalue2; } //块方法, 循环展开, 线程粒度 __global__ void MatrixMulKernel5(float* Md, float* Nd, float* Pd, int Width) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; float Pvalue1 = 0, Pvalue2=0; for (int m = 0; m < gridDim.x; ++m) { __shared__ float Mds[TILE_WIDTH][ThreadColumn*TILE_WIDTH]; __shared__ float Nds[ThreadColumn*TILE_WIDTH][ThreadColumn*TILE_WIDTH]; for(int k=0;k<ThreadColumn; ++k) Mds[ty][tx+k*TILE_WIDTH] = *(Md + (by*blockDim.y + ty) * Width + ThreadColumn*m*blockDim.x + tx + k*TILE_WIDTH ); for(int h=0;h<ThreadColumn; ++h) for(int k=0; k<ThreadColumn; ++k) Nds[ty+h*TILE_WIDTH][tx+k*TILE_WIDTH] = *(Nd + (m*blockDim.y*ThreadColumn + ty + h*TILE_WIDTH) * Width + bx*blockDim.x*ThreadColumn + tx + k*TILE_WIDTH); __syncthreads(); Pvalue1 += Mds[ty][0] * Nds[0][tx] + Mds[ty][1] * Nds[1][tx] + Mds[ty][2] * Nds[2][tx] + Mds[ty][3] * Nds[3][tx] + Mds[ty][4] * Nds[4][tx] + Mds[ty][5] * Nds[5][tx] + Mds[ty][6] * Nds[6][tx] + Mds[ty][7] * Nds[7][tx] + Mds[ty][8] * Nds[8][tx] + Mds[ty][9] * Nds[9][tx] + Mds[ty][10] * Nds[10][tx] + Mds[ty][11] * Nds[11][tx] + Mds[ty][12] * Nds[12][tx] + Mds[ty][13] * Nds[13][tx] + Mds[ty][14] * Nds[14][tx] + Mds[ty][15] * Nds[15][tx] + Mds[ty][16] * Nds[16][tx] + Mds[ty][17] * Nds[17][tx] + Mds[ty][18] * Nds[18][tx] + Mds[ty][19] * Nds[19][tx] + Mds[ty][20] * Nds[20][tx] + Mds[ty][21] * Nds[21][tx] + Mds[ty][22] * Nds[22][tx] + Mds[ty][23] * Nds[23][tx] + Mds[ty][24] * Nds[24][tx] + Mds[ty][25] * Nds[25][tx] + Mds[ty][26] * Nds[26][tx] + Mds[ty][27] * Nds[27][tx] + Mds[ty][28] * Nds[28][tx] + Mds[ty][29] * Nds[29][tx] + Mds[ty][30] * Nds[30][tx] + Mds[ty][31] * Nds[31][tx]; Pvalue2 += Mds[ty][0] * Nds[0][tx+TILE_WIDTH] + Mds[ty][1] * Nds[1][tx+TILE_WIDTH] + Mds[ty][2] * Nds[2][tx+TILE_WIDTH] + Mds[ty][3] * Nds[3][tx+TILE_WIDTH] + Mds[ty][4] * Nds[4][tx+TILE_WIDTH] + Mds[ty][5] * Nds[5][tx+TILE_WIDTH] + Mds[ty][6] * Nds[6][tx+TILE_WIDTH] + Mds[ty][7] * Nds[7][tx+TILE_WIDTH] + Mds[ty][8] * Nds[8][tx+TILE_WIDTH] + Mds[ty][9] * Nds[9][tx+TILE_WIDTH] + Mds[ty][10] * Nds[10][tx+TILE_WIDTH] + Mds[ty][11] * Nds[11][tx+TILE_WIDTH] + Mds[ty][12] * Nds[12][tx+TILE_WIDTH] + Mds[ty][13] * Nds[13][tx+TILE_WIDTH] + Mds[ty][14] * Nds[14][tx+TILE_WIDTH] + Mds[ty][15] * Nds[15][tx+TILE_WIDTH] + Mds[ty][16] * Nds[16][tx+TILE_WIDTH] + Mds[ty][17] * Nds[17][tx+TILE_WIDTH] + Mds[ty][18] * Nds[18][tx+TILE_WIDTH] + Mds[ty][19] * Nds[19][tx+TILE_WIDTH] + Mds[ty][20] * Nds[20][tx+TILE_WIDTH] + Mds[ty][21] * Nds[21][tx+TILE_WIDTH] + Mds[ty][22] * Nds[22][tx+TILE_WIDTH] + Mds[ty][23] * Nds[23][tx+TILE_WIDTH] + Mds[ty][24] * Nds[24][tx+TILE_WIDTH] + Mds[ty][25] * Nds[25][tx+TILE_WIDTH] + Mds[ty][26] * Nds[26][tx+TILE_WIDTH] + Mds[ty][27] * Nds[27][tx+TILE_WIDTH] + Mds[ty][28] * Nds[28][tx+TILE_WIDTH] + Mds[ty][29] * Nds[29][tx+TILE_WIDTH] + Mds[ty][30] * Nds[30][tx+TILE_WIDTH] + Mds[ty][31] * Nds[31][tx+TILE_WIDTH]; __syncthreads(); } Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx] = Pvalue1; Pd[(by*blockDim.y + ty) * Width + bx*ThreadColumn*blockDim.x + tx + TILE_WIDTH] = Pvalue2; } int main(int argc, char **argv) { int Width(0); bool is_test=false; if(argc==1){ Width=512; is_test=false; } else if(argc==2){ Width=atoi(argv[1]); is_test=false; } else{ Width=atoi(argv[1]); is_test=(bool)atoi(argv[2]); } float *h_A=(float*)malloc(Width*Width*sizeof(float)); float *h_B=(float*)malloc(Width*Width*sizeof(float)); float *h_C=(float*)malloc(Width*Width*sizeof(float)); float *h_C_ref=(float*)malloc(Width*Width*sizeof(float)); float *d_A, *d_B, *d_C; float t0, error_norm=0, ref_norm=0; unsigned int timer1=0; cutCreateTimer(&timer1); cutStartTimer(timer1); for(int i=0; i<Width*Width; i++) { h_A[i]=rand()/(float)RAND_MAX; h_B[i]=rand()/(float)RAND_MAX; } cublasInit(); cublasAlloc(Width*Width, sizeof(float), (void**)&d_A); cublasAlloc(Width*Width, sizeof(float), (void**)&d_B); cublasAlloc(Width*Width, sizeof(float), (void**)&d_C); cublasSetVector(Width*Width, sizeof(float), h_A, 1, d_A, 1); cublasSetVector(Width*Width, sizeof(float), h_B, 1, d_B, 1); if(is_test) MatrixMul(h_A, h_B, h_C_ref,Width); dim3 dimBlock(TILE_WIDTH, TILE_WIDTH); dim3 dimGrid(Width/TILE_WIDTH, Width/TILE_WIDTH); t0=cutGetTimerValue(timer1); cudaThreadSynchronize(); MatrixMulKernel1<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t1=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1); if(is_test){ error_norm=0; ref_norm=0; for(int i=0; i<Width*Width; i++) { float diff=h_C_ref[i]-h_C[i]; error_norm+=diff*diff; ref_norm+=h_C_ref[i]*h_C_ref[i]; } printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED"); } t0=cutGetTimerValue(timer1); cudaThreadSynchronize(); MatrixMulKernel2<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t2=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1); if(is_test){ error_norm=0; ref_norm=0; for(int i=0; i<Width*Width; i++) { float diff=h_C_ref[i]-h_C[i]; error_norm+=diff*diff; ref_norm+=h_C_ref[i]*h_C_ref[i]; } printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED"); } t0=cutGetTimerValue(timer1); cudaThreadSynchronize(); MatrixMulKernel3<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t3=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1); if(is_test){ error_norm=0; ref_norm=0; for(int i=0; i<Width*Width; i++) { float diff=h_C_ref[i]-h_C[i]; error_norm+=diff*diff; ref_norm+=h_C_ref[i]*h_C_ref[i]; } printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED"); } dimGrid.x/=ThreadColumn; t0=cutGetTimerValue(timer1); cudaThreadSynchronize(); MatrixMulKernel4<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t4=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1); if(is_test){ error_norm=0; ref_norm=0; for(int i=0; i<Width*Width; i++) { float diff=h_C_ref[i]-h_C[i]; error_norm+=diff*diff; ref_norm+=h_C_ref[i]*h_C_ref[i]; } printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED"); } t0=cutGetTimerValue(timer1); cudaThreadSynchronize(); MatrixMulKernel5<<<dimGrid,dimBlock>>>(d_A,d_B,d_C,Width); cudaThreadSynchronize(); float gpu_t5=(cutGetTimerValue(timer1)-t0)/1000.0f; cublasGetVector(Width*Width, sizeof(float), d_C, 1, h_C, 1); if(is_test){ error_norm=0; ref_norm=0; for(int i=0; i<Width*Width; i++) { float diff=h_C_ref[i]-h_C[i]; error_norm+=diff*diff; ref_norm+=h_C_ref[i]*h_C_ref[i]; } printf("Test %s/n", (sqrtf(error_norm/ref_norm)<1E-6) ? "PASSED" : "FAILED"); } printf("矩阵阶数为%4d,", Width); printf("简单方法: %.6fs(%.3fGflops),", gpu_t1, 1e-9*Width*Width*Width*2/gpu_t1); printf("块方法: %.6fs(%.3fGflops),", gpu_t2, 1e-9*Width*Width*Width*2/gpu_t2); printf("块+循环展开方法: %.6fs(%.3fGflops),", gpu_t3, 1e-9*Width*Width*Width*2/gpu_t3); printf("块+线程粒度2: %.6fs(%.3fGflops),", gpu_t4, 1e-9*Width*Width*Width*2/gpu_t4); printf("块+循环展开方法+线程粒度2: %.6fs(%.3fGflops)/n", gpu_t5, 1e-9*Width*Width*Width*2/gpu_t5); }

环境：CUDA toolkit3.2+Windows XP+CUDA SDK中的vs2008模板release编译通过，显卡是GeForce GT240。大家可根据自己的情况进行测试，报告一下结果吧.

不同规模的运行结果:
矩阵阶数为 512,简单方法: 0.033887s(7.922Gflops),块方法: 0.008424s(31.865Gflops),块+循环展开方法: 0.003995s(67.191Gflops),块+线程粒度2: 0.008091s(33.179Gflops),块+循环展开方法+线程粒度2: 0.003399s(78.984Gflops)

矩阵阶数为1024,简单方法: 0.264424s(8.121Gflops),块方法: 0.066201s(32.439Gflops),块+循环展开方法: 0.030827s(69.662Gflops),块+线程粒度2: 0.063264s(33.945Gflops),块+循环展开方法+线程粒度2: 0.026282s(81.710Gflops)

矩阵阶数为2048,简单方法: 2.112513s(8.132Gflops),块方法: 0.527002s(32.599Gflops),块+循环展开方法: 0.244058s(70.392Gflops),块+线程粒度2: 0.505495s(33.986Gflops),块+循环展开方法+线程粒度2: 0.208845s(82.261Gflops)

矩阵阶数为4096,简单方法: 17.705070s(7.763Gflops),块方法: 4.205308s(32.682Gflops),块+循环展开方法: 1.966043s(69.906Gflops),块+线程粒度2: 4.059064s(33.860Gflops),块+循环展开方法+线程粒度2: 1.677771s(81.918Gflops)

测试结果表明在块大小16x16时，块方法和循环展开对速度有很显著的影响，而线程粒度的使用对速度只有很小的提高。而块大小是8x8时的情形没有测试，线程粒度对速度应该有较大影响，家可以做一做。

参考文献: David B. Kirk, Wen-mei W. Hwu. 大规模并行处理器编程实战[M]. 北京: 清华大学出版社, 2010.