在CUDA编程04——矩阵相乘 (去除长度限制)CUDA编程03——矩阵相乘CUDA编程04——矩阵相乘 (去除长度限制) 中,另外一个问题是kernel 函数中存在很多global memory的读写操作。这些操作主要是一些重复读取,例如在计算目标矩阵C中的一列元素时,每一个元素的计算都需要读取矩阵A中的一行和B中的一列。行每次都在变化,但是列每次都是一样的。同理计算目标矩阵C中的一行元素时也一样。因此,如果可以共享这些部分,则可以减少对于global memory的访问,提升吞吐量。
Shared Memory
GPU上的memory有两种:
· On-board memory
· On-chip memory
Global memory是on-board memory,Shared memory是on-chip memory。因为是片上的,shared memory比本地和全局内存快得多。实际上,共享内存延迟大约比未缓存的全局内存延迟低 100 倍(前提是线程之间没有内存冲突)。共享内存是按线程块分配的,因此块中的所有线程都可以访问同一共享内存。线程可以访问由同一线程块中的其他线程从全局内存加载的共享内存中的数据。此功能(与线程同步结合)有许多用途,例如用户管理的数据缓存、高性能的协作并行算法(例如并行缩减),以及在不可能实现全局内存合并的情况下促进全局内存合并。
结构图:
GPU代码:
// device code
__global__ void matrix_mul_device(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
// shared memory
__shared__ FLOAT Mds[TILE_WIDTH][TILE_WIDTH];
__shared__ FLOAT Nds[TILE_WIDTH][TILE_WIDTH];
int tx = threadIdx.x, ty = threadIdx.y, bx = blockIdx.x, by = blockIdx.y;
int row = by*TILE_WIDTH + ty;
int col = bx*TILE_WIDTH + tx;
float sum = 0;
for (int i = 0; i < width / TILE_WIDTH; ++i)
{
// every thread write corresponding value to shared memory
Mds[ty][tx] = M[row*width + i*TILE_WIDTH + tx];
Nds[ty][tx] = M[col + width*(ty + TILE_WIDTH*i)];
// wait for all threads
__syncthreads();
for (int j = 0; j < TILE_WIDTH; ++j)
{
sum += Mds[ty][j] * Nds[j][tx];
__syncthreads();
}
}
P[row*width + col] = sum;
}
上面代码的逻辑是:在一个block内,行列数据都是在共享内存上的,因此计算的时候就只需要访问共享内存,而不需要访问global memory。
__syncthreads()可以让同一个block的线程同步。
注意:两层循环的次数是width/TILE_WIDTH*TILE_WIDTH=width。
步骤:
1. 从global memory中读取第一个block中的数据到共享内容;
2. 同步等待所有线程读取完毕;
3. 对当前block内的计算(直接从shared memory中读取);
4. 同步等待所有当前block内的对应值(临时的)计算完毕;
5. 写回结果到global memory。
完整代码
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <windows.h>
typedef float FLOAT;
double get_time();
void warm_up();
void matrix_mul_host(FLOAT* M, FLOAT* N, FLOAT* P, int width);
__global__ void matrix_mul_device(FLOAT* M, FLOAT* N, FLOAT* P, int width);
// <2d grid, 1d block>
#define get_tid() ((blockIdm.y*gridDim.x + blockIdm.x)*blockDim.x + threadIdm.x)
#define get_bid() (blockIdm.y*gridDim.x + blockIdm.x)
#define MATRIX_DIM 1024
#define TILE_WIDTH 16
double get_time()
{
LARGE_INTEGER timer;
static LARGE_INTEGER fre;
static int init = 0;
double t;
if (init != 1)
{
QueryPerformanceFrequency(&fre);
init = 1;
}
QueryPerformanceCounter(&timer);
t = timer.QuadPart * 1. / fre.QuadPart;
return t;
}
// warm up gpu
__global__ void warmup_knl(void)
{
int i, j;
i = 1;
j = 1;
i = i + j;
}
void warm_up()
{
int i = 0;
for (; i < 8; ++i)
{
warmup_knl << <1, 256 >> > ();
}
}
// host code
void matrix_mul_host(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
for (int i = 0; i < width; ++i)
{
for (int j = 0; j < width; ++j)
{
float sum = 0, a, b;
for (int k = 0; k < width; ++k)
{
a = M[i*width + k];
b = N[k*width + j];
sum += a * b;
}
P[i*width + j] = sum;
}
}
}
// device code
__global__ void matrix_mul_device(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
// shared memory
__shared__ FLOAT Mds[TILE_WIDTH][TILE_WIDTH];
__shared__ FLOAT Nds[TILE_WIDTH][TILE_WIDTH];
int tx = threadIdx.x, ty = threadIdx.y, bx = blockIdx.x, by = blockIdx.y;
int row = by*TILE_WIDTH + ty;
int col = bx*TILE_WIDTH + tx;
float sum = 0;
for (int i = 0; i < width / TILE_WIDTH; ++i)
{
// every thread write corresponding value to shared memory
Mds[ty][tx] = M[row*width + i*TILE_WIDTH + tx];
Nds[ty][tx] = M[col + width*(ty + TILE_WIDTH*i)];
// wait for all threads
__syncthreads();
for (int j = 0; j < TILE_WIDTH; ++j)
{
sum += Mds[ty][j] * Nds[j][tx];
__syncthreads();
}
}
P[row*width + col] = sum;
}
int main()
{
int nbytes = MATRIX_DIM * MATRIX_DIM * sizeof(FLOAT);
int n_grid = int(MATRIX_DIM / TILE_WIDTH);
dim3 dimGrid(n_grid, n_grid);
dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
cudaError_t cudaStatus = cudaSetDevice(0);
// memory pointers
FLOAT* dm = NULL, *hm = NULL;
FLOAT* dn = NULL, *hn = NULL;
FLOAT* dp = NULL, *hp = NULL;
int iter = 1;
int i;
double th, td;
warm_up();
/* allocate gpu memory */
cudaMalloc((void**)&dm, nbytes);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dm failed!");
goto Error;
}
cudaMalloc((void**)&dn, nbytes);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dn failed!");
goto Error;
}
cudaMalloc((void**)&dp, nbytes);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc dp failed!");
goto Error;
}
if (dm == NULL || dn == NULL || dp == NULL)
{
printf("could not allocate gpu memory/n");
goto Error;
}
hm = (FLOAT*)malloc(nbytes);
hn = (FLOAT*)malloc(nbytes);
hp = (FLOAT*)malloc(nbytes);
if (hm == NULL || hn == NULL || hp == NULL)
{
printf("could not allocate cpu memory/n");
goto Error;
}
/* init */
for (i = 0; i < MATRIX_DIM*MATRIX_DIM; ++i)
{
hm[i] = i % 2 == 0 ? 1 : -1;
hn[i] = 1;
hp[i] = 2;
}
/* copy data to gpu*/
cudaMemcpy(dm, hm, nbytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy dm failed!");
goto Error;
}
cudaMemcpy(dn, hn, nbytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy dn failed!");
goto Error;
}
cudaMemcpy(dp, hp, nbytes, cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy dp failed!");
goto Error;
}
warm_up();
// call for gpu
cudaThreadSynchronize();
td = get_time();
for (i = 0; i < iter; ++i) matrix_mul_device << <dimGrid, dimBlock >> > (dm, dn, dp, MATRIX_DIM);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "matmulKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
cudaThreadSynchronize();
td = get_time() - td;
// call for cpu
th = get_time();
for (i = 0; i < iter; ++i) matrix_mul_host(hm, hn, hp, MATRIX_DIM);
th = get_time() - th;
printf("GPU time: %.8f, CPU time: %.6f, Speedup: %g\n", td, th, th / td);
FLOAT* hp2 = (FLOAT*)malloc(nbytes);
for (int i = 0; i < MATRIX_DIM*MATRIX_DIM; ++i) hp2[i] = 0;
cudaMemcpy(hp2, dp, nbytes, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!!!");
goto Error;
}
// check final results, should be zeros
float xh = 0, xd = 0;
for (int i = 0; i < MATRIX_DIM*MATRIX_DIM; ++i)
{
xh += hp[i];
xd += hp2[i];
}
printf("%.6f, %.6f", xh, xd);
Error:
// free
cudaFree(dm);
cudaFree(dn);
cudaFree(dp);
free(hm);
free(hn);
free(hp);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
实验结果
GPU time: 0.02112680, CPU time: 6.339133, Speedup: 300.052
0.000000, 0.000000
从实验结果看,速度提升不如上一篇的方式,这主要是因为线程需要同步带来的延迟。