【CUDA并行编程之四】矩阵相乘

最新推荐文章于 2024-07-05 17:43:50 发布

忆之独秀

最新推荐文章于 2024-07-05 17:43:50 发布

阅读量4.1k

点赞数

分类专栏： HPC 文章标签：并行计算 cuda 矩阵相乘

本文链接：https://blog.csdn.net/lavorange/article/details/41896591

版权

HPC 专栏收录该内容

9 篇文章 6 订阅

订阅专栏

前面介绍了基本的Cuda编程的相关知识，那么这一篇在此基础之上来看看GPU在处理数据计算上的高效能，我们拿矩阵相乘来作为例子。

1.CPU上执行矩阵相乘以及性能。

在CPU上进行矩阵相乘运算的代码：

mat_mul.cc:

//D = A * B + C (matrix product of A and B, then element-wise add of C)
#include<iostream>
#include<vector>
#include<map>
#include<fstream>
#include"wtime.h" 

using namespace std;

// Matrix dimension: the loops in this file treat every matrix as N x N.
const int N = 320;

// The matrices are kept in two layouts: 2-D arrays (a, b, c, d) and
// flattened 1-D arrays (aa, bb, cc, dd).
// Each array is declared with N+1 entries per dimension; the loops in this
// file only index 0..N-1.
int a[N+1][N+1],b[N+1][N+1],c[N+1][N+1],d[N+1][N+1];
int aa[(N+1)*(N+1)],bb[(N+1)*(N+1)],cc[(N+1)*(N+1)],dd[(N+1)*(N+1)];

// Fills the N x N region of the 2-D inputs: a with 1, b with 2, c with 3.
void init()
{
	for(int row=0;row<N;++row)
	{
		for(int col=0;col<N;++col)
		{
			a[row][col] = 1;
			b[row][col] = 2;
			c[row][col] = 3;
		}
	}
}

// Fills the first N*N entries of the flattened inputs:
// aa with 1, bb with 2, cc with 3.
void init1()
{
	// i*N+j for i,j in [0,N) visits exactly the indices 0..N*N-1.
	const int count = N*N;
	for(int idx=0;idx<count;++idx)
	{
		aa[idx] = 1;
		bb[idx] = 2;
		cc[idx] = 3;
	}
}

// Computes d = a * b + c over the N x N region of the 2-D global matrices.
// Each output element is built in a local accumulator and then stored, so the
// result does not depend on what d held before the call (the function can be
// called repeatedly, e.g. for repeated timing runs).
void mul()
{
	for(int i=0;i<N;i++)
	  for(int j=0;j<N;j++)
	  {
		int sum = 0;
		for(int k=0;k<N;k++)
		{
			sum += a[i][k] * b[k][j];
		}
		d[i][j] = sum + c[i][j];
	  }
}

// Computes dd = aa * bb + cc on the flattened matrices, where element (i,j)
// is stored at index i*N+j.
// Each output element is built in a local accumulator and then stored, so the
// result does not depend on what dd held before the call.
void mul1()
{
	for(int i=0;i<N;i++)
	  for(int j=0;j<N;j++)
	  {
		int sum = 0;
		for(int k=0;k<N;k++)
		{
			sum += aa[i*N+k] * bb[k*N+j];
		}
		dd[i*N+j] = sum + cc[i*N+j];
	  }
}

// Writes the N x N region of the 2-D result matrix d to "result.txt",
// one matrix row per line, each value followed by a space.
// If the file cannot be opened, the error is reported and nothing is written.
void print()
{
	ofstream fout;
	fout.open("result.txt");
	if(!fout)
	{
		perror("can not open the file");
		// Do not keep streaming into a stream that failed to open.
		return;
	}
	for(int i=0;i<N;i++)
	{
	  for(int j=0;j<N;j++)
	  {
	  	  fout<<d[i][j]<<" ";
	  }
	  // '\n' instead of endl: same bytes, without flushing after every row.
	  fout<<'\n';
	}
	fout.close();
}

// Entry point: fills the flattened inputs, times a single mul1() call with
// wtime() and prints the elapsed seconds.
int main()
{
	init1();

	const double started = wtime();
	mul1();
	const double elapsed = wtime()-started;
	printf("computation timing = %10.10f sec\n",elapsed);

	// Left disabled; note that print() writes d, which this path does not fill.
	//print();

	return 0;
}

wtime.h:

// Include guard renamed: identifiers beginning with an underscore followed by
// an uppercase letter are reserved for the implementation.
#ifndef WTIME_H
#define WTIME_H

// Returns the current wall-clock time in seconds.
double wtime();

#endif // WTIME_H

wtime.cc:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

// Returns the current wall-clock time in seconds, with microsecond
// resolution, as reported by gettimeofday().
// If gettimeofday() fails, the error is reported via perror() and 0.0 is
// returned instead of a value computed from an uninitialised timeval.
double wtime(void)
{
	struct timeval now;

	// The timezone argument of gettimeofday() is obsolete; NULL is the
	// documented value to pass.
	if(gettimeofday(&now,NULL)==-1)
	{
		// No trailing newline: perror() appends ": <reason>\n" itself.
		perror("Error:calling gettimeofday() not successfully");
		return 0.0;
	}

	return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
}

#if 0
// Disabled manual check: prints the value returned by wtime().
int main()
{
	const double now = wtime();
	printf("time of day = %10.4f\n",now);
	return 0;
}
#endif

makefile:

# `target` builds the CPU benchmark and runs it. It does not produce a file
# called "target", so declare it phony to keep make from skipping it when such
# a file exists.
.PHONY: target
target:
	g++ mat_mul.cc wtime.cc
	./a.out

结果：

2.GPU上执行矩阵相乘以及性能。

代码：

cuda_mat_mul_v1.cu:

//matrix multiplication with global memory 
#include<iostream>
#include<fstream>
#include "wtime.h"

using namespace std;


// Threads per block along each dimension (blocks are BLOCK_SIZE x BLOCK_SIZE).
const int BLOCK_SIZE = 16;
// Blocks per grid along each dimension (the grid is GRID_SIZE x GRID_SIZE);
// main() uses BLOCK_SIZE * GRID_SIZE as the matrix side length.
const int GRID_SIZE = 20;

// Computes D = A * B + C for N x N int matrices stored row-major
// (element (r,c) at index r*N + c).
//
// Launch layout: 2-D grid of 2-D blocks. Each thread produces one element of
// dd; its row comes from the y index and its column from the x index. The
// grid must cover at least N x N threads; threads that fall outside the
// matrix return without touching memory.
//
// da, db, dc: input matrices A, B, C.  dd: output matrix D.  N: side length.
__global__ void mat_mul(int *da,int *db,int *dc,int *dd,int N)
{
	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	// Guard against launches whose grid overshoots the matrix.
	if(row >= N || col >= N)
		return;

	int sum = 0;
	for(int i=0;i<N;i++)
	{
		// B(i,col) lives at i*N + col; the former row*i+col only gave the
		// right answer because B was filled with a single constant.
		sum += da[row*N + i] * db[i*N + col];
	}
	dd[row*N + col] = sum + dc[row*N + col];
}

// Exits with a diagnostic on stderr when a CUDA runtime call did not succeed.
static void check_cuda(cudaError_t status,const char *what)
{
	if(status != cudaSuccess)
	{
		cerr<<what<<" failed: "<<cudaGetErrorString(status)<<endl;
		exit(-1);
	}
}

// Runs D = A * B + C on the GPU for N x N matrices (N = BLOCK_SIZE * GRID_SIZE),
// prints the kernel time, and writes D to "result_v1.txt" one row per line.
// Returns 0 on success; exits with -1 if a CUDA call or the file open fails.
int main()
{
	const int N = BLOCK_SIZE * GRID_SIZE;
	const size_t bytes = (size_t)N * N * sizeof(int);
	int *ha,*hb,*hc,*hd;
	int *da,*db,*dc,*dd;
	double time;
	ha = new int[N*N];
	hb = new int[N*N];
	hc = new int[N*N];
	hd = new int[N*N]();	// zero-filled so the host result buffer never holds indeterminate values
	cudaError_t err;

	//initialize
	for(int i=0;i<N;i++)
		for(int j=0;j<N;j++)
		{
			ha[i*N+j] = 1;
			hb[i*N+j] = 2;
			hc[i*N+j] = 3;
		}

	//malloc
	check_cuda(cudaMalloc(&da,bytes),"cudaMalloc(da)");
	check_cuda(cudaMalloc(&db,bytes),"cudaMalloc(db)");
	check_cuda(cudaMalloc(&dc,bytes),"cudaMalloc(dc)");
	err = cudaMalloc(&dd,bytes);
	printf("Cuda Malloc C : %s\n",cudaGetErrorString(err));
	check_cuda(err,"cudaMalloc(dd)");

	//host to device
	//dd is output-only (the launch below writes every element), so it is not uploaded.
	check_cuda(cudaMemcpy(da,ha,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(da)");
	check_cuda(cudaMemcpy(db,hb,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(db)");
	check_cuda(cudaMemcpy(dc,hc,bytes,cudaMemcpyHostToDevice),"cudaMemcpy(dc)");

	dim3 threadBlock(BLOCK_SIZE,BLOCK_SIZE);
	dim3 grid(GRID_SIZE,GRID_SIZE);
	//kernel
	time = wtime();
	mat_mul<<<grid,threadBlock>>>(da,db,dc,dd,N);
	check_cuda(cudaGetLastError(),"mat_mul launch");
	//Kernel launches return immediately; wait for completion so the timer
	//covers the computation and not just the launch overhead.
	check_cuda(cudaDeviceSynchronize(),"mat_mul execution");
	printf("Computation time is %10.10f\n",wtime()-time);

	//device to host
	check_cuda(cudaMemcpy(hd,dd,bytes,cudaMemcpyDeviceToHost),"cudaMemcpy(hd)");

	//print result to file
	ofstream fout;
	fout.open("result_v1.txt");
	if(!fout)  
	{
		cerr<<"open the file error"<<endl;
		exit(-1);
	}
	for(int i=0;i<N;i++)	
	{
		for(int j=0;j<N;j++)
		{
			fout<<hd[i*N+j]<<" ";
		}
		fout<<endl;
	}
	fout.close();
	
	delete []ha;delete []hb;delete []hc;delete []hd;
	cudaFree(da);cudaFree(db);cudaFree(dc);cudaFree(dd);

	return 0;
}

cuda_wtime.cu:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

// Returns the current wall-clock time in seconds, with microsecond
// resolution, as reported by gettimeofday().
// If gettimeofday() fails, the error is reported via perror() and 0.0 is
// returned instead of a value computed from an uninitialised timeval.
double wtime(void)
{
	struct timeval now;

	// The timezone argument of gettimeofday() is obsolete; NULL is the
	// documented value to pass.
	if(gettimeofday(&now,NULL)==-1)
	{
		// No trailing newline: perror() appends ": <reason>\n" itself.
		perror("Error:calling gettimeofday() not successfully");
		return 0.0;
	}

	return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
}

#if 0
// Disabled manual check: prints the value returned by wtime().
int main()
{
	const double now = wtime();
	printf("time of day = %10.4f\n",now);
	return 0;
}
#endif

wtime.h:

// Include guard renamed: identifiers beginning with an underscore followed by
// an uppercase letter are reserved for the implementation.
#ifndef WTIME_H
#define WTIME_H

// Returns the current wall-clock time in seconds.
double wtime();

#endif // WTIME_H

cuda_wtime.cu:

#include <stdio.h>
#include <sys/time.h>
#include <iostream>
#include <cstdlib>

// Returns the current wall-clock time in seconds, with microsecond
// resolution, as reported by gettimeofday().
// If gettimeofday() fails, the error is reported via perror() and 0.0 is
// returned instead of a value computed from an uninitialised timeval.
double wtime(void)
{
	struct timeval now;

	// The timezone argument of gettimeofday() is obsolete; NULL is the
	// documented value to pass.
	if(gettimeofday(&now,NULL)==-1)
	{
		// No trailing newline: perror() appends ": <reason>\n" itself.
		perror("Error:calling gettimeofday() not successfully");
		return 0.0;
	}

	return (double)now.tv_sec + (double)now.tv_usec / 1000000.0;
}

#if 0
// Disabled manual check: prints the value returned by wtime().
int main()
{
	const double now = wtime();
	printf("time of day = %10.4f\n",now);
	return 0;
}
#endif

makefile:

# `cu` builds the GPU benchmark with nvcc and runs it. It does not produce a
# file called "cu", so declare it phony to keep make from skipping it when such
# a file exists.
.PHONY: cu
cu:
	nvcc cuda_mat_mul_v1.cu cuda_wtime.cu
	./a.out

结果：

3.计算性能对比：

矩阵大小	1600*1600	1200*1200	800*800	320*320
串行时间/s	30.9	11.49865	2.597987	0.162311
并行时间	grid=100/block=16	grid=75/block=16	grid=50/block=16	grid=20/block=16
kernel执行时间/s（测量时未等待kernel执行完成，数值仅反映异步启动开销）	0.0000319	0.0000309944	0.0000309944	0.0000231266
并行计算总时间（分配内存+数据拷贝+计算）/s	0.70796	0.439213	0.310214	0.237676