In the previous article, 第一个Cuda程序,矩阵相乘代码 (our first CUDA program: matrix multiplication), we wrote a parallel matrix multiplication that produced the same result as the CPU version in far less time. That, however, was only the most basic approach; the program can be made faster still.
Look closely at that version and you will see that every operand is read from global memory. Shared memory is much faster to access than global memory, so here we stage tiles of the input matrices in shared memory to speed the multiplication up further.
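For comparison, here is a minimal sketch of a purely global-memory kernel, roughly what the previous article used (the exact code there may differ, and gpu_matrix_naive is just a name chosen here):

__global__ void gpu_matrix_naive(float* a, float* b, float* c, const int m, const int n, const int k)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;  // column of c
    int y = threadIdx.y + blockIdx.y * blockDim.y;  // row of c
    if (y < m && x < k)
    {
        float temp = 0.0f;
        for (int step = 0; step < n; step++)
        {
            // both operands come straight from global memory on every iteration
            temp += a[y * n + step] * b[step * k + x];
        }
        c[y * k + x] = temp;
    }
}

Each thread above touches global memory 2*n times to produce a single output element. In the shared-memory version below, a block cooperatively loads BLOCK_SIZE x BLOCK_SIZE tiles of a and b once, and every thread in the block then reuses them, which is where the extra speed comes from. The complete program: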
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <device_functions.h>

#define BLOCK_SIZE 8
// a is M x N, b is N x K, so c = a * b is M x K
#define M 6
#define N 8
#define K 6

// __managed__ places the arrays in unified memory, accessible from both host and device
__managed__ float a[M * N];
__managed__ float b[N * K];
__managed__ float c_gpu[M * K];
__managed__ float c_cpu[M * K];
__global__ void gpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k)
{
    // tiles of a and b staged in fast on-chip shared memory
    __shared__ float sub_a[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float sub_b[BLOCK_SIZE][BLOCK_SIZE];
    int x = threadIdx.x + blockIdx.x * blockDim.x;  // column of c this thread computes
    int y = threadIdx.y + blockIdx.y * blockDim.y;  // row of c this thread computes
    float temp = 0.0f;
    int step, i;
    // walk across the shared dimension n, one BLOCK_SIZE-wide tile at a time
    for (step = 0; step < (n + BLOCK_SIZE - 1) / BLOCK_SIZE; step++)
    {
        // each thread loads one element of the a tile, padding with zero outside the matrix
        if ((step * BLOCK_SIZE + threadIdx.x) >= n || y >= m)
        {
            sub_a[threadIdx.y][threadIdx.x] = 0.0f;
        }
        else
        {
            sub_a[threadIdx.y][threadIdx.x] = a[y * n + (step * BLOCK_SIZE + threadIdx.x)];
        }
        // and one element of the b tile
        if ((step * BLOCK_SIZE + threadIdx.y) >= n || x >= k)
        {
            sub_b[threadIdx.y][threadIdx.x] = 0.0f;
        }
        else
        {
            sub_b[threadIdx.y][threadIdx.x] = b[(step * BLOCK_SIZE + threadIdx.y) * k + x];
        }
        __syncthreads();  // wait until the whole tile has been loaded
        // partial dot product over this tile, read entirely from shared memory
        for (i = 0; i < BLOCK_SIZE; i++)
        {
            temp = temp + sub_a[threadIdx.y][i] * sub_b[i][threadIdx.x];
        }
        __syncthreads();  // make sure everyone is done before the tile is overwritten
    }
    // write the finished element once, after all tiles have been accumulated
    if (x < k && y < m)
    {
        c[y * k + x] = temp;
    }
}
void cpu_matrix(float* a, float* b, float* c, const int m, const int n, const int k)
{
    // straightforward triple loop on the host, used as the reference result
    int y, x, step;
    float temp;
    for (y = 0; y < m; y++)
    {
        for (x = 0; x < k; x++)
        {
            temp = 0;
            for (step = 0; step < n; step++)
            {
                temp += a[y * n + step] * b[step * k + x];
            }
            c[y * k + x] = temp;
        }
    }
}
int main()
{
    int x, y;
    float item1;
    // initialize the two input matrices
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            item1 = x + y;
            a[y * N + x] = item1;
        }
    }
    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            item1 = x + y;
            b[y * K + x] = item1;
        }
    }
    printf("-----------------the two input matrices-----------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < N; x++)
        {
            printf("%f ", a[y * N + x]);
        }
        printf("\n");
    }
    for (y = 0; y < N; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f ", b[y * K + x]);
        }
        printf("\n");
    }
    // the grid's x dimension covers the K columns of c, the y dimension its M rows
    unsigned int grid_rows = (M + BLOCK_SIZE - 1) / BLOCK_SIZE;
    unsigned int grid_cols = (K + BLOCK_SIZE - 1) / BLOCK_SIZE;
    dim3 dimGrid(grid_cols, grid_rows);
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    gpu_matrix <<<dimGrid, dimBlock>>> (a, b, c_gpu, M, N, K);
    cudaDeviceSynchronize();  // wait for the kernel before reading c_gpu on the host
    cpu_matrix(a, b, c_cpu, M, N, K);
    // print the CPU result
    printf("---------------CPU result---------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f ", c_cpu[y * K + x]);
        }
        printf("\n");
    }
    // print the GPU result
    printf("---------------GPU result---------------\n");
    for (y = 0; y < M; y++)
    {
        for (x = 0; x < K; x++)
        {
            printf("%f ", c_gpu[y * K + x]);
        }
        printf("\n");
    }
    return 0;
}
-----------------the two input matrices-----------------
0.000000 1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000
1.000000 2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000
2.000000 3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000
3.000000 4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000
4.000000 5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000
5.000000 6.000000 7.000000 8.000000 9.000000 10.000000 11.000000 12.000000
0.000000 1.000000 2.000000 3.000000 4.000000 5.000000
1.000000 2.000000 3.000000 4.000000 5.000000 6.000000
2.000000 3.000000 4.000000 5.000000 6.000000 7.000000
3.000000 4.000000 5.000000 6.000000 7.000000 8.000000
4.000000 5.000000 6.000000 7.000000 8.000000 9.000000
5.000000 6.000000 7.000000 8.000000 9.000000 10.000000
6.000000 7.000000 8.000000 9.000000 10.000000 11.000000
7.000000 8.000000 9.000000 10.000000 11.000000 12.000000
---------------CPU result---------------
140.000000 168.000000 196.000000 224.000000 252.000000 280.000000
168.000000 204.000000 240.000000 276.000000 312.000000 348.000000
196.000000 240.000000 284.000000 328.000000 372.000000 416.000000
224.000000 276.000000 328.000000 380.000000 432.000000 484.000000
252.000000 312.000000 372.000000 432.000000 492.000000 552.000000
280.000000 348.000000 416.000000 484.000000 552.000000 620.000000
---------------GPU result---------------
140.000000 168.000000 196.000000 224.000000 252.000000 280.000000
168.000000 204.000000 240.000000 276.000000 312.000000 348.000000
196.000000 240.000000 284.000000 328.000000 372.000000 416.000000
224.000000 276.000000 328.000000 380.000000 432.000000 484.000000
252.000000 312.000000 372.000000 432.000000 492.000000 552.000000
280.000000 348.000000 416.000000 484.000000 552.000000 620.000000
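The CPU and GPU results agree. Checking by eye is fine for a 6 x 6 result, but as a sketch (not part of the program above), the comparison could also be done in code by adding something like the following to main() after both results are computed; fabs needs <math.h>:

// hypothetical element-by-element check of the two result matrices
int ok = 1;
for (int idx = 0; idx < M * K; idx++)
{
    if (fabs(c_cpu[idx] - c_gpu[idx]) > 1e-5)
    {
        ok = 0;
        break;
    }
}
printf("results %s\n", ok ? "match" : "differ");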
At this point we have made matrix multiplication considerably faster than the CPU-only version, which matters a great deal in practice, especially when the amount of data to process is very large.
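To actually measure that speed on your own machine, CUDA events can be used to time the kernel. A minimal sketch that replaces the plain launch line in main() (the variable names here are illustrative):

cudaEvent_t start, stop;
float ms = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
gpu_matrix <<<dimGrid, dimBlock>>> (a, b, c_gpu, M, N, K);
cudaEventRecord(stop);
cudaEventSynchronize(stop);  // wait until the kernel and the stop event have completed
cudaEventElapsedTime(&ms, start, stop);
printf("kernel time: %f ms\n", ms);
cudaEventDestroy(start);
cudaEventDestroy(stop);

Timing the cpu_matrix call with an ordinary host timer and comparing the two numbers then gives the actual speedup on your hardware.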
You may not fully understand both programs yet, but don't worry: run the code first, get a feel for what GPU computing can do, and use that as the foundation for what comes next.