【CUDA并行编程之四】矩阵相乘

最新推荐文章于 2023-08-05 21:16:36 发布

zbxzc

最新推荐文章于 2023-08-05 21:16:36 发布

阅读量744

点赞数

分类专栏： CUDA 文章标签： cuda

CUDA 专栏收录该内容

18 篇文章 0 订阅

订阅专栏

前面介绍了基本的Cuda编程的相关知识，那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能，我们拿矩阵相乘来作为例子。

1.CPU上执行矩阵相乘以及性能。

在CPU上进行矩阵相乘运算的代码：

mat_mul.cc:

[cpp]view plaincopy 
   
 //a[i]*b[i] + c[i] = d[i]  
 #include<iostream>  
 #include<vector>  
 #include<map>  
 #include<fstream>  
 #include"wtime.h"   
   
 using namespace std;  
   
 const int N = 320;  
   
 //矩阵有两种表达的方法用二维矩阵或者用一维矩阵表示  
 int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];  
 int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];  
   
 void init()  
 {  
     for(int i=0;i<N;i++)  
         for(int j=0;j<N;j++)  
         {  
             a[i][j] = 1;  
             b[i][j] = 2;  
             c[i][j] = 3;  
         }  
 }  
   
 void init1()  
 {  
     for(int i=0;i<N;i++)  
         for(int j=0;j<N;j++)  
         {  
             aa[i*N+j] = 1;  
             bb[i*N+j] = 2;  
             cc[i*N+j] = 3;  
         }  
 }  
   
 void mul()  
 {  
     for(int i=0;i<N;i++)   
       for(int j=0;j<N;j++)  
       {  
         for(int k=0;k<N;k++)  
         {  
             d[i][j] += a[i][k] * b[k][j];  
         }  
         d[i][j] += c[i][j];  
       }  
 }  
   
 void mul1()  
 {  
     for(int i=0;i<N;i++)   
       for(int j=0;j<N;j++)  
       {  
         for(int k=0;k<N;k++)  
         {  
             dd[i*N+j] += aa[i*N+k] * bb[k*N+j];  
         }  
         dd[N*i+j] += cc[N*i+j];  
       }  
 }  
   
 void print()  
 {  
     ofstream fout;  
     fout.open("result.txt");  
     if(!fout)  
     {  
         perror("can not open the file");  
     }  
     for(int i=0;i<N;i++)  
     {  
       for(int j=0;j<N;j++)  
       {  
           fout<<d[i][j]<<" ";  
       }  
       fout<<endl;  
     }  
     fout.close();  
 }  
   
 int main()  
 {  
     init1();      
       
     double t = wtime();  
     mul1();  
     t = wtime()-t;  
     printf("computation timing = %10.10f sec\n",t);  
       
     //print();  
   
     return 0;  
 }  

wtime.h:

[cpp]view plaincopy 
   
 #ifndef _WTIME_  
 #define _WTIME_  
   
 double wtime();  
   
 #endif  

wtime.cc:

[cpp]view plaincopy 
   
 #include <stdio.h>  
 #include <sys/time.h>  
 #include <iostream>  
 #include <cstdlib>  
   
 double wtime(void)  
 {  
     double now_time;  
     struct timeval etstart;  
     struct timezone tzp;  
   
     if(gettimeofday(&etstart,&tzp)==-1)  
     {  
         perror("Error:calling gettimeofday() not successfully.\n");  
     }  
   
     now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;  
   
     return now_time;  
 }  
   
 #if 0  
 int main()  
 {  
     double time;  
     time = wtime();  
   
     printf("time of day = %10.4f\n",time);  
   
     return 0;  
 }  
 #endif  

makefile:

[cpp]view plaincopy 
   
 target:  
     g++ mat_mul.cc wtime.cc  
     ./a.out  

结果：

2.GPU上执行矩阵相乘以及性能。

代码：

cuda_mat_mul_v1.cu:

[cpp]view plaincopy 
   
 //matrix multiplication with global memory   
 #include<iostream>  
 #include<fstream>  
 #include "wtime.h"  
   
 using namespace std;  
   
   
 const int BLOCK_SIZE = 16;  
 const int GRID_SIZE = 20;  
   
 //D = A * B + C;  
 __global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)  
 {  
     int row = blockIdx.y * blockDim.y + threadIdx.y;  
     int col = blockIdx.x * blockDim.x + threadIdx.x;  
   
     int sum = 0;  
     for(int i=0;i<N;i++)  
     {  
         sum += da[row*N + i] * db[row*i+col];  
     }  
     dd[row*N + col] = sum + dc[row*N + col];  
 }  
   
 int main()  
 {  
     int N = BLOCK_SIZE * GRID_SIZE;  
     int *ha,*hb,*hc,*hd;  
     int *da,*db,*dc,*dd;  
     double time;  
     ha = new int[N*N];  
     hb = new int[N*N];  
     hc = new int[N*N];  
     hd = new int[N*N];  
     cudaError_t err;  
   
     //initialize  
     for(int i=0;i<N;i++)  
         for(int j=0;j<N;j++)  
         {  
             ha[i*N+j] = 1;  
             hb[i*N+j] = 2;  
             hc[i*N+j] = 3;  
         }  
   
     //malloc</strong>  
     cudaMalloc(&da,N*N*sizeof(int));  
     cudaMalloc(&db,N*N*sizeof(int));  
     cudaMalloc(&dc,N*N*sizeof(int));  
     err = cudaMalloc(&dd,N*N*sizeof(int));  
     printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));  
   
     //host to device  
     cudaMemcpy(da,ha,N*N*sizeof(int),cudaMemcpyHostToDevice);  
     cudaMemcpy(db,hb,N*N*sizeof(int),cudaMemcpyHostToDevice);  
     cudaMemcpy(dc,hc,N*N*sizeof(int),cudaMemcpyHostToDevice);  
     cudaMemcpy(dd,hd,N*N*sizeof(int),cudaMemcpyHostToDevice);  
   
     dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);  
     dim3 grid(GRID_SIZE,GRID_SIZE);  
     //kernel  
     time = wtime();  
     mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);  
     printf("Computation time is %10.10f\n",wtime()-time);  
   
     //device to host  
     cudaMemcpy(hd,dd,N*N*sizeof(int),cudaMemcpyDeviceToHost);  
   
     //print result to file  
     ofstream fout;  
     fout.open("result_v1.txt");  
     if(!fout)    
     {  
         cerr<<"open the file error"<<endl;  
         exit(-1);  
     }  
     for(int i=0;i<N;i++)   
     {  
         for(int j=0;j<N;j++)  
         {  
             fout<<hd[i*N+j]<<" ";  
         }  
         fout<<endl;  
     }  
       
     delete []ha;delete []hb;delete []hc;delete []hd;  
     cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);  
   
     return 0;  
 }  

cuda_wtime.cu:

[cpp]view plaincopy 
   
 #include <stdio.h>  
 #include <sys/time.h>  
 #include <iostream>  
 #include <cstdlib>  
   
 double wtime(void)  
 {  
     double now_time;  
     struct timeval etstart;  
     struct timezone tzp;  
   
     if(gettimeofday(&etstart,&tzp)==-1)  
     {  
         perror("Error:calling gettimeofday() not successfully.\n");  
     }  
   
     now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;  
   
     return now_time;  
 }  
   
 #if 0  
 int main()  
 {  
     double time;  
     time = wtime();  
   
     printf("time of day = %10.4f\n",time);  
   
     return 0;  
 }  
 #endif  

wtime.h:

[cpp]view plaincopy 
   
 #ifndef _WTIME_  
 #define _WTIME_  
   
 double wtime();  
   
 #endif  

cuda_wtime.cu:

[cpp]view plaincopy 
   
 #include <stdio.h>  
 #include <sys/time.h>  
 #include <iostream>  
 #include <cstdlib>  
   
 double wtime(void)  
 {  
     double now_time;  
     struct timeval etstart;  
     struct timezone tzp;  
   
     if(gettimeofday(&etstart,&tzp)==-1)  
     {  
         perror("Error:calling gettimeofday() not successfully.\n");  
     }  
   
     now_time = ( (double)etstart.tv_sec ) + ((double)etstart.tv_usec) / 1000000.0;  
   
     return now_time;  
 }  
   
 #if 0  
 int main()  
 {  
     double time;  
     time = wtime();  
   
     printf("time of day = %10.4f\n",time);  
   
     return 0;  
 }  
 #endif  

makefile:

[cpp]view plaincopy 
   
 cu:  
     nvcc cuda_mat_mul_v1.cu cuda_wtime.cu  
     ./a.out  

结果：

3.计算性能对比：

矩阵大小	*16001600**	1200*1200	800*800	320*320
串行时间/s	30.9	11.49865	2.597987	0.162311
并行时间	grid=100/block=16	grid=75/block=16	grid=50/block=16	grid=20/block=16
kernel执行时间/s	0.0000319	0.0000309944	0.0000309944	0.0000231266
并行计算总时间（分配内存加+数据拷贝+计算）/s	0.70796	0.439213	0.310214	0.237676