前面介绍了CUDA编程的基本知识,这一篇在此基础上以矩阵相乘为例,来看看GPU在数据计算上的高效能。
1.CPU上执行矩阵相乘以及性能。
在CPU上进行矩阵相乘运算的代码:
mat_mul.cc:
- //a[i]*b[i] + c[i] = d[i]
- #include<iostream>
- #include<vector>
- #include<map>
- #include<fstream>
- #include"wtime.h"
- using namespace std;
- const int N = 320; // matrix dimension; the loops in this file run over 0..N-1
- // A matrix can be stored either as a 2-D array or as a flattened 1-D array
- int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1]; // 2-D form, used by init()/mul()/print()
- int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)]; // 1-D form, indexed as i*N+j by init1()/mul1()
- // Fill the N x N region of the 2-D matrices: a with 1, b with 2, c with 3.
- void init()
- {
-     for(int row=0;row<N;++row)
-     {
-         for(int col=0;col<N;++col)
-         {
-             c[row][col] = 3;
-             b[row][col] = 2;
-             a[row][col] = 1;
-         }
-     }
- }
- // Fill the first N*N entries of the flattened matrices: aa with 1, bb with 2, cc with 3.
- void init1()
- {
-     // i*N+j for 0<=i,j<N visits every index in [0, N*N) exactly once,
-     // so a single flat loop touches the same elements.
-     const int count = N*N;
-     for(int idx=0;idx<count;++idx)
-     {
-         aa[idx] = 1;
-         bb[idx] = 2;
-         cc[idx] = 3;
-     }
- }
- // Compute d = a * b + c over the N x N region of the 2-D matrices.
- // Each element is built in a local accumulator and then assigned, so the
- // result does not depend on whatever d held before the call (the previous
- // version added onto d and was only correct on the first call).
- void mul()
- {
-     for(int i=0;i<N;i++)
-     {
-         for(int j=0;j<N;j++)
-         {
-             int sum = c[i][j];
-             for(int k=0;k<N;k++)
-             {
-                 sum += a[i][k] * b[k][j];
-             }
-             d[i][j] = sum;
-         }
-     }
- }
- // Compute dd = aa * bb + cc for the flattened matrices, where element
- // (i,j) lives at index i*N+j.
- // Each element is built in a local accumulator and then assigned, so the
- // result does not depend on whatever dd held before the call (the previous
- // version added onto dd and was only correct on the first call).
- void mul1()
- {
-     for(int i=0;i<N;i++)
-     {
-         for(int j=0;j<N;j++)
-         {
-             int sum = cc[i*N+j];
-             for(int k=0;k<N;k++)
-             {
-                 sum += aa[i*N+k] * bb[k*N+j];
-             }
-             dd[i*N+j] = sum;
-         }
-     }
- }
- // Write the N x N region of d to "result.txt", one matrix row per line,
- // with a space after every value. Reports the failure and returns without
- // writing anything if the file cannot be opened.
- void print()
- {
-     ofstream fout;
-     fout.open("result.txt");
-     if(!fout)
-     {
-         perror("can not open the file");
-         // Nothing can be written to a stream that failed to open.
-         return;
-     }
-     for(int i=0;i<N;i++)
-     {
-         for(int j=0;j<N;j++)
-         {
-             fout<<d[i][j]<<" ";
-         }
-         // '\n' instead of endl avoids flushing the file after every row.
-         fout<<'\n';
-     }
-     fout.close();
- }
- // Time the flattened (1-D) matrix multiply on the CPU and print the elapsed seconds.
- int main()
- {
-     init1();
-     const double start = wtime();
-     mul1();
-     const double elapsed = wtime() - start;
-     printf("computation timing = %10.10f sec\n",elapsed);
-     //print();
-     return 0;
- }
- #ifndef _WTIME_
- #define _WTIME_
- double wtime();
- #endif
- #include <stdio.h>
- #include <sys/time.h>
- #include <iostream>
- #include <cstdlib>
- // Return the current wall-clock time in seconds (microsecond resolution)
- // as reported by gettimeofday(), or -1.0 if the call fails.
- double wtime(void)
- {
-     struct timeval now;
-     // The timezone argument is obsolete; a null pointer is passed instead.
-     if(gettimeofday(&now,NULL)==-1)
-     {
-         // perror appends ": <reason>" and a newline itself, so the message carries none.
-         perror("Error:calling gettimeofday() not successfully.");
-         // Do not compute a time from an unfilled struct.
-         return -1.0;
-     }
-     return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
- }
- // Disabled self-test: prints the current time of day when enabled.
- #if 0
- int main()
- {
-     double time;
-     time = wtime();
-     printf("time of day = %10.4f\n",time);
-     return 0;
- }
- #endif
makefile:
- target:
- g++ mat_mul.cc wtime.cc
- ./a.out
结果:
2.GPU上执行矩阵相乘以及性能。
代码:
cuda_mat_mul_v1.cu:
- //matrix multiplication with global memory
- #include<iostream>
- #include<fstream>
- #include "wtime.h"
- using namespace std;
- const int BLOCK_SIZE = 16; // threads per block along each dimension (main launches BLOCK_SIZE x BLOCK_SIZE blocks)
- const int GRID_SIZE = 20; // blocks per grid along each dimension; main uses N = BLOCK_SIZE * GRID_SIZE
- //D = A * B + C;
- // Compute D = A * B + C for row-major N x N int matrices
- // (da = A, db = B, dc = C, dd = D).
- // Expects a 2-D launch: the x dimension indexes columns, the y dimension
- // indexes rows, and each thread produces one element of dd. Threads that
- // fall outside the N x N matrix return without touching memory.
- __global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
- {
-     int row = blockIdx.y * blockDim.y + threadIdx.y;
-     int col = blockIdx.x * blockDim.x + threadIdx.x;
-     // Guard against launches whose grid does not divide N exactly.
-     if(row >= N || col >= N)
-     {
-         return;
-     }
-     int sum = 0;
-     for(int i=0;i<N;i++)
-     {
-         // A(row,i) * B(i,col); B's row stride is N (was wrongly indexed as row*i+col).
-         sum += da[row*N + i] * db[i*N + col];
-     }
-     dd[row*N + col] = sum + dc[row*N + col];
- }
- // Abort with a message if a CUDA runtime call did not succeed.
- static void check_cuda(cudaError_t err,const char *what)
- {
-     if(err != cudaSuccess)
-     {
-         cerr<<what<<" failed: "<<cudaGetErrorString(err)<<endl;
-         exit(-1);
-     }
- }
- // Multiply two N x N matrices on the GPU (D = A * B + C), print the kernel
- // execution time and write the result matrix to "result_v1.txt".
- int main()
- {
-     int N = BLOCK_SIZE * GRID_SIZE;
-     size_t bytes = (size_t)N * N * sizeof(int);
-     int *ha,*hb,*hc,*hd;
-     int *da,*db,*dc,*dd;
-     double start;
-     ha = new int[N*N];
-     hb = new int[N*N];
-     hc = new int[N*N];
-     hd = new int[N*N];
-     cudaError_t err;
-     //initialize
-     for(int i=0;i<N;i++)
-         for(int j=0;j<N;j++)
-         {
-             ha[i*N+j] = 1;
-             hb[i*N+j] = 2;
-             hc[i*N+j] = 3;
-         }
-     //malloc
-     check_cuda(cudaMalloc(&da,bytes),"cudaMalloc(da)");
-     check_cuda(cudaMalloc(&db,bytes),"cudaMalloc(db)");
-     check_cuda(cudaMalloc(&dc,bytes),"cudaMalloc(dc)");
-     err = cudaMalloc(&dd,bytes);
-     printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));
-     check_cuda(err,"cudaMalloc(dd)");
-     //host to device
-     //dd is fully written by the kernel, so the uninitialized hd is not uploaded
-     check_cuda(cudaMemcpy(da,ha,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(da)");
-     check_cuda(cudaMemcpy(db,hb,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(db)");
-     check_cuda(cudaMemcpy(dc,hc,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(dc)");
-     dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
-     dim3 grid(GRID_SIZE,GRID_SIZE);
-     //kernel
-     start = wtime();
-     mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
-     //a launch reports configuration errors only through cudaGetLastError()
-     check_cuda(cudaGetLastError(),"mat_mul launch");
-     //the launch is asynchronous: wait for the kernel to finish before
-     //stopping the timer, otherwise only the launch overhead is measured
-     check_cuda(cudaDeviceSynchronize(),"mat_mul execution");
-     printf("Computation time is %10.10f\n",wtime()-start);
-     //device to host
-     check_cuda(cudaMemcpy(hd,dd,bytes,cudaMemcpyDeviceToHost),"cudaMemcpy(hd)");
-     //print result to file
-     ofstream fout;
-     fout.open("result_v1.txt");
-     if(!fout)
-     {
-         cerr<<"open the file error"<<endl;
-         exit(-1);
-     }
-     for(int i=0;i<N;i++)
-     {
-         for(int j=0;j<N;j++)
-         {
-             fout<<hd[i*N+j]<<" ";
-         }
-         fout<<endl;
-     }
-     delete []ha;delete []hb;delete []hc;delete []hd;
-     check_cuda(cudaFree(da),"cudaFree(da)");
-     check_cuda(cudaFree(db),"cudaFree(db)");
-     check_cuda(cudaFree(dc),"cudaFree(dc)");
-     check_cuda(cudaFree(dd),"cudaFree(dd)");
-     return 0;
- }
- #include <stdio.h>
- #include <sys/time.h>
- #include <iostream>
- #include <cstdlib>
- // Return the current wall-clock time in seconds (microsecond resolution)
- // as reported by gettimeofday(), or -1.0 if the call fails.
- double wtime(void)
- {
-     struct timeval now;
-     // The timezone argument is obsolete; a null pointer is passed instead.
-     if(gettimeofday(&now,NULL)==-1)
-     {
-         // perror appends ": <reason>" and a newline itself, so the message carries none.
-         perror("Error:calling gettimeofday() not successfully.");
-         // Do not compute a time from an unfilled struct.
-         return -1.0;
-     }
-     return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
- }
- // Disabled self-test: prints the current time of day when enabled.
- #if 0
- int main()
- {
-     double time;
-     time = wtime();
-     printf("time of day = %10.4f\n",time);
-     return 0;
- }
- #endif
- #ifndef _WTIME_
- #define _WTIME_
- double wtime();
- #endif
cuda_wtime.cu:
- #include <stdio.h>
- #include <sys/time.h>
- #include <iostream>
- #include <cstdlib>
- // Return the current wall-clock time in seconds (microsecond resolution)
- // as reported by gettimeofday(), or -1.0 if the call fails.
- double wtime(void)
- {
-     struct timeval now;
-     // The timezone argument is obsolete; a null pointer is passed instead.
-     if(gettimeofday(&now,NULL)==-1)
-     {
-         // perror appends ": <reason>" and a newline itself, so the message carries none.
-         perror("Error:calling gettimeofday() not successfully.");
-         // Do not compute a time from an unfilled struct.
-         return -1.0;
-     }
-     return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
- }
- // Disabled self-test: prints the current time of day when enabled.
- #if 0
- int main()
- {
-     double time;
-     time = wtime();
-     printf("time of day = %10.4f\n",time);
-     return 0;
- }
- #endif
makefile:
- cu:
- nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
- ./a.out
结果:
3.计算性能对比:
| 矩阵大小 | 1600*1600 | 1200*1200 | 800*800 | 320*320 |
| --- | --- | --- | --- | --- |
| 串行时间/s | 30.9 | 11.49865 | 2.597987 | 0.162311 |
| 并行时间 | grid=100/block=16 | grid=75/block=16 | grid=50/block=16 | grid=20/block=16 |
| kernel执行时间/s | 0.0000319 | 0.0000309944 | 0.0000309944 | 0.0000231266 |
| 并行计算总时间(分配内存+数据拷贝+计算)/s | 0.70796 | 0.439213 | 0.310214 | 0.237676 |
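可见,在矩阵规模较大的时候,GPU强大的计算能力体现得非常明显。需要注意的是,kernel的启动是异步的:表中的"kernel执行时间"几乎不随矩阵规模变化,说明它很可能只测到了kernel启动的开销(在停止计时之前需要调用cudaDeviceSynchronize()才能得到真实的执行时间);因此更有参考价值的是包含分配内存、数据拷贝和计算在内的并行计算总时间。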
注明出处:http://blog.csdn.net/lavorange/article/details/41896591