(Figure: illustration of matrix multiplication, P = M × N.)
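In case the figure does not render, the computation is easy to state: each element of the width × width output P is the dot product of one row of M with one column of N,

P_{ij} = \sum_{k=0}^{width-1} M_{ik} \, N_{kj}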
CPU code
The most direct matrix multiplication algorithm is a triple nested loop:
void matrix_mul_host(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
    for (int i = 0; i < width; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            FLOAT sum = 0, a, b;
            // dot product of row i of M with column j of N
            for (int k = 0; k < width; ++k)
            {
                a = M[i*width + k];
                b = N[k*width + j];
                sum += a * b;
            }
            P[i*width + j] = sum;
        }
    }
}
GPU code
On the GPU, each thread can compute one element of the output matrix. With a 1x1 grid and an NxN block, the kernel looks like this:
__global__ void matrix_mul_device(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
    // each thread computes one element of P:
    // ty selects the row of M, tx selects the column of N
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    FLOAT pvalue = 0, mElem, nElem;
    for (int i = 0; i < width; ++i)
    {
        mElem = M[ty*width + i];
        nElem = N[i*width + tx];
        pvalue += mElem * nElem;
    }
    P[ty*width + tx] = pvalue;
}
Full code:
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <windows.h>
typedef float FLOAT;
double get_time();
void warm_up();
void matrix_mul_host(FLOAT* M, FLOAT* N, FLOAT* P, int width);
__global__ void matrix_mul_device(FLOAT* M, FLOAT* N, FLOAT* P, int width);
// <2d grid, 1d block> (helper macros; not used in this example)
#define get_tid() ((blockIdx.y*gridDim.x + blockIdx.x)*blockDim.x + threadIdx.x)
#define get_bid() (blockIdx.y*gridDim.x + blockIdx.x)
double get_time()
{
    LARGE_INTEGER timer;
    static LARGE_INTEGER fre;
    static int init = 0;
    double t;
    if (init != 1)
    {
        QueryPerformanceFrequency(&fre);
        init = 1;
    }
    QueryPerformanceCounter(&timer);
    t = timer.QuadPart * 1. / fre.QuadPart;
    return t;
}
// warm up gpu
__global__ void warmup_knl(void)
{
    int i, j;
    i = 1;
    j = 1;
    i = i + j;
}

void warm_up()
{
    int i = 0;
    for (; i < 8; ++i)
    {
        warmup_knl<<<1, 256>>>();
    }
}
// host code
void matrix_mul_host(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
    for (int i = 0; i < width; ++i)
    {
        for (int j = 0; j < width; ++j)
        {
            FLOAT sum = 0, a, b;
            for (int k = 0; k < width; ++k)
            {
                a = M[i*width + k];
                b = N[k*width + j];
                sum += a * b;
            }
            P[i*width + j] = sum;
        }
    }
}
// device code
__global__ void matrix_mul_device(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    FLOAT pvalue = 0, mElem, nElem;
    for (int i = 0; i < width; ++i)
    {
        mElem = M[ty*width + i];
        nElem = N[i*width + tx];
        pvalue += mElem * nElem;
    }
    P[ty*width + tx] = pvalue;
}
int main()
{
    int N = 32;
    int nbytes = N * N * sizeof(FLOAT);
    dim3 dimGrid(1, 1);
    dim3 dimBlock(N, N);
    cudaError_t cudaStatus = cudaSetDevice(0);
    FLOAT* dm = NULL, *hm = NULL;
    FLOAT* dn = NULL, *hn = NULL;
    FLOAT* dp = NULL, *hp = NULL;
    int iter = 100000;
    int i;
    double th, td;
    warm_up();

    /* allocate gpu memory */
    cudaStatus = cudaMalloc((void**)&dm, nbytes);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc dm failed!");
        return -4;
    }
    cudaStatus = cudaMalloc((void**)&dn, nbytes);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc dn failed!");
        return -4;
    }
    cudaStatus = cudaMalloc((void**)&dp, nbytes);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc dp failed!");
        return -4;
    }
    if (dm == NULL || dn == NULL || dp == NULL)
    {
        printf("could not allocate gpu memory\n");
        return -1;
    }

    hm = (FLOAT*)malloc(nbytes);
    hn = (FLOAT*)malloc(nbytes);
    hp = (FLOAT*)malloc(nbytes);
    if (hm == NULL || hn == NULL || hp == NULL)
    {
        printf("could not allocate cpu memory\n");
        return -2;
    }
    /* init: rows of M alternate +1/-1, N is all ones */
    for (i = 0; i < N*N; ++i)
    {
        hm[i] = i % 2 == 0 ? 1 : -1;
        hn[i] = 1;
        hp[i] = 2;
    }

    /* copy data to gpu */
    cudaStatus = cudaMemcpy(dm, hm, nbytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy dm failed!");
        return -4;
    }
    cudaStatus = cudaMemcpy(dn, hn, nbytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy dn failed!");
        return -4;
    }
    cudaStatus = cudaMemcpy(dp, hp, nbytes, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy dp failed!");
        return -4;
    }
    warm_up();

    // time the gpu kernel: launch `iter` times, then synchronize
    cudaDeviceSynchronize();
    td = get_time();
    for (i = 0; i < iter; ++i) matrix_mul_device<<<dimGrid, dimBlock>>>(dm, dn, dp, N);
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "matmulKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        return -5;
    }
    cudaDeviceSynchronize();
    td = get_time() - td;

    // time the cpu version
    th = get_time();
    for (i = 0; i < iter; ++i) matrix_mul_host(hm, hn, hp, N);
    th = get_time() - th;
    printf("GPU time: %.8f, CPU time: %.6f, Speedup: %g\n", td, th, th / td);

    // copy the gpu result back
    FLOAT* hp2 = (FLOAT*)malloc(nbytes);
    for (int i = 0; i < N*N; ++i)
    {
        hp2[i] = 0;
    }
    cudaStatus = cudaMemcpy(hp2, dp, nbytes, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!!!");
        return -3;
    }

    // check final results: each row of M pairs +1 with -1 against the
    // all-ones N, so every element of P should be zero
    float xh = 0, xd = 0;
    for (int i = 0; i < N*N; ++i)
    {
        xh += hp[i];
        xd += hp2[i];
    }
    printf("%.6f, %.6f\n", xh, xd);

    // free
    cudaFree(dm);
    cudaFree(dn);
    cudaFree(dp);
    free(hm);
    free(hn);
    free(hp);
    free(hp2);
    return 0;
}
Results:
GPU time: 2.32100360, CPU time: 10.617398, Speedup: 4.57449
0.000000, 0.000000
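As an aside, the timings above come from QueryPerformanceCounter, so the program builds only on Windows. A portable alternative is to time the launch loop with CUDA events; the snippet below is a hypothetical replacement for the get_time()/cudaDeviceSynchronize() pair around the GPU loop, not part of the original program:

cudaEvent_t start, stop;
float ms = 0;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for (i = 0; i < iter; ++i) matrix_mul_device<<<dimGrid, dimBlock>>>(dm, dn, dp, N);
cudaEventRecord(stop);
cudaEventSynchronize(stop);              // block until the stop event completes
cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
cudaEventDestroy(start);
cudaEventDestroy(stop);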
Note
This example uses only a 1x1 grid (a single block) with a two-dimensional thread block, which leaves plenty of room for improvement:
1. Because only one block is used, the matrix size is capped by the maximum number of threads per block, so large matrices cannot be handled. On an RTX TITAN, for example, a block holds at most 1024 threads, which is why this example multiplies 32x32 matrices. Exceeding the limit produces the following error (a multi-block version is sketched after this list):
launch failed: invalid configuration argument
2. The kernel performs many redundant global memory reads and writes, which hurts throughput; a shared-memory tiling sketch also follows below.
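Addressing point 1: the grid itself can be made two-dimensional so that many blocks tile the output matrix, with each thread deriving its global row and column from both blockIdx and threadIdx. A minimal sketch; the 16x16 tile size, the kernel name, and the bounds check are illustrative choices, not part of the original code:

#define TILE 16  // 16x16 = 256 threads per block, well under the 1024-thread limit

__global__ void matrix_mul_device_2d(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
    // global row/column of the output element this thread computes
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= width || col >= width) return;  // guard when width is not a multiple of TILE

    FLOAT pvalue = 0;
    for (int k = 0; k < width; ++k)
        pvalue += M[row*width + k] * N[k*width + col];
    P[row*width + col] = pvalue;
}

Launched with enough blocks to cover the whole matrix:

    dim3 dimBlock(TILE, TILE);
    dim3 dimGrid((width + TILE - 1) / TILE, (width + TILE - 1) / TILE);
    matrix_mul_device_2d<<<dimGrid, dimBlock>>>(dm, dn, dp, width);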
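Addressing point 2: the standard remedy is shared-memory tiling. Each block stages TILE x TILE sub-matrices of M and N in on-chip shared memory, so every global element is read once per tile instead of once per thread in the block. A sketch under the same assumptions as above, additionally assuming width is an exact multiple of TILE to keep the code short:

__global__ void matrix_mul_tiled(FLOAT* M, FLOAT* N, FLOAT* P, int width)
{
    __shared__ FLOAT Ms[TILE][TILE];
    __shared__ FLOAT Ns[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    FLOAT pvalue = 0;

    // walk over the tiles of M's row strip and N's column strip
    for (int t = 0; t < width / TILE; ++t)
    {
        // each thread loads one element of each tile into shared memory
        Ms[threadIdx.y][threadIdx.x] = M[row*width + t*TILE + threadIdx.x];
        Ns[threadIdx.y][threadIdx.x] = N[(t*TILE + threadIdx.y)*width + col];
        __syncthreads();  // wait until the whole tile is loaded

        for (int k = 0; k < TILE; ++k)
            pvalue += Ms[threadIdx.y][k] * Ns[k][threadIdx.x];
        __syncthreads();  // wait before the tiles are overwritten
    }
    P[row*width + col] = pvalue;
}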