// Shared memory (tiled) matrix-multiply demo.
// Computes C[MH,NW] = A[MH,MW] * B[MW,NW].
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#define N 8 //A、B、C为方阵时的矩阵宽度
//非方阵的设置如下:
#define MH 16 //A的行数
#define MW 32 //A的列数=B的行数
#define NW 16 //B的列数
//C的行数=MH,列数=NW
#define THREAD_WIDTH 16 //每个Block中Thread个数,3个维的宽度乘积不能超过1024
#define TILE_WIDTH 16//瓦片宽度,应保证各矩阵的宽高是TILE_WIDTH的整数倍
// Tiled matrix multiply: Pd[mh,nw] = Md[mh,mw] * Nd[mw,nw].
// Expects TILE_WIDTH x TILE_WIDTH thread blocks and a grid of
// ceil(nw/TILE_WIDTH) x ceil(mh/TILE_WIDTH) blocks.
//
// BUGFIX vs. original: the original placed __syncthreads() inside two
// divergent branches (undefined behavior / deadlock when mh, mw or nw is
// not a multiple of TILE_WIDTH), and out-of-range threads skipped the
// shared-memory loads, leaving stale tile data for boundary blocks. Here
// every thread participates in the loads (zero-padding out-of-range
// elements) so the barriers are reached uniformly, and only the final
// store is guarded.
__global__ void MatrixMulOptimazationKernel(const float* Md, const float* Nd, float* Pd, int mh, int mw, int nw)
{
    // Per-block tiles of Md and Nd staged in shared memory.
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int row = blockIdx.y * TILE_WIDTH + ty; // row of Pd this thread computes
    int col = blockIdx.x * TILE_WIDTH + tx; // column of Pd this thread computes
    int loop = (mw + TILE_WIDTH - 1) / TILE_WIDTH; // tiles along the shared K dimension
    float sum = 0.0f;
    for (int m = 0; m < loop; ++m)
    {
        int mx = m * TILE_WIDTH + tx; // column index into Md for this tile
        int ny = m * TILE_WIDTH + ty; // row index into Nd for this tile
        // Cooperative load: zero-pad out-of-range elements so every thread
        // reaches the barrier and partial tiles still contribute correctly.
        Mds[ty][tx] = (row < mh && mx < mw) ? Md[row * mw + mx] : 0.0f;
        Nds[ty][tx] = (ny < mw && col < nw) ? Nd[ny * nw + col] : 0.0f;
        __syncthreads(); // tile fully loaded before anyone reads it
        for (int k = 0; k < TILE_WIDTH; ++k)
            sum += Mds[ty][k] * Nds[k][tx];
        __syncthreads(); // all reads done before the next tile overwrites
    }
    // Guard only the global store; threads outside Pd did redundant work
    // on zero-padded data but wrote nothing.
    if (row < mh && col < nw)
        Pd[row * nw + col] = sum;
}
// Host-side CUDA error check: print the failing call site and abort.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Builds all-ones matrices A (mh x mw) and B (mw x nw), multiplies them on
// the GPU with the tiled kernel, prints A, B, C and the kernel time.
// With all-ones inputs every element of C should equal mw.
int main()
{
    int mh = MH; // rows of A
    int mw = MW; // cols of A == rows of B
    int nw = NW; // cols of B
    size_t asize = (size_t)mh * mw * sizeof(float);
    size_t bsize = (size_t)mw * nw * sizeof(float);
    size_t csize = (size_t)mh * nw * sizeof(float);
    float* a = (float*)malloc(asize);
    float* b = (float*)malloc(bsize);
    float* c = (float*)malloc(csize);
    if (!a || !b || !c) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    float *da, *db, *dc;
    float time = 0.0f; // kernel time in milliseconds
    CUDA_CHECK(cudaMalloc((void**)&da, asize));
    CUDA_CHECK(cudaMalloc((void**)&db, bsize));
    CUDA_CHECK(cudaMalloc((void**)&dc, csize));
    // Initialize A and B to all ones.
    for (int i = 0; i < mh; i++)
        for (int j = 0; j < mw; j++)
            a[i * mw + j] = 1.0f;
    for (int i = 0; i < mw; i++)
        for (int j = 0; j < nw; j++)
            b[i * nw + j] = 1.0f;
    CUDA_CHECK(cudaMemcpy(da, a, asize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(db, b, bsize, cudaMemcpyHostToDevice));
    // Time the kernel launch with CUDA events.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start, 0));
    dim3 threadperblock(THREAD_WIDTH, THREAD_WIDTH, 1);
    // BUGFIX: grid y-dimension must be divided by threadperblock.y;
    // the original divided by threadperblock.x (masked by square blocks).
    dim3 blockpergrid((nw + threadperblock.x - 1) / threadperblock.x,
                      (mh + threadperblock.y - 1) / threadperblock.y, 1);
    MatrixMulOptimazationKernel<<<blockpergrid, threadperblock>>>(da, db, dc, mh, mw, nw);
    CUDA_CHECK(cudaGetLastError()); // catch launch-configuration errors
    CUDA_CHECK(cudaEventRecord(stop, 0));
    CUDA_CHECK(cudaEventSynchronize(stop)); // also waits for the kernel to finish
    CUDA_CHECK(cudaEventElapsedTime(&time, start, stop));
    CUDA_CHECK(cudaMemcpy(c, dc, csize, cudaMemcpyDeviceToHost));
    printf("A=\n");
    for (int i = 0; i < mh; i++) {
        for (int j = 0; j < mw; j++)
            printf("%.2f ", a[i * mw + j]);
        printf("\n");
    }
    printf("B=\n");
    for (int i = 0; i < mw; i++) {
        for (int j = 0; j < nw; j++)
            printf("%.2f ", b[i * nw + j]);
        printf("\n");
    }
    printf("C=\n");
    for (int i = 0; i < mh; i++) {
        for (int j = 0; j < nw; j++)
            printf("%.2f ", c[i * nw + j]);
        printf("\n");
    }
    printf("the GPU performing time is %f ms \n", time);
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    free(a);
    free(b);
    free(c);
    CUDA_CHECK(cudaFree(da));
    CUDA_CHECK(cudaFree(db));
    CUDA_CHECK(cudaFree(dc));
    return 0;
}
// [CUDA Notes #1] shared memory optimization
// (originally published 2022-10-03 15:34:24)