并行程序设计原理 HW1
冯浩然 1600013009
1
Intro
- 本次作业完成过程中,我从初学到探索,由简单到优化地尝试了三种实现矩阵转置的方法。本报告将依次介绍这三种实现,并展示和分析它们的效率。
2
实现
1. 外围函数
参考 CUDA 手册给出的例子和作业要求,我们假设矩阵为 1024 × 1024 的
float
型矩阵。首先给出生成矩阵的generator()
函数和测试转置结果正确性的check()
函数其中
generator()
函数中生成随机数的部分参考了https://zhidao.baidu.com/question/1514652593460316220.html中的回答,check()
函数采用不考虑性能的简单实现
/*
* generate random cuda matrix with "curand.h", and copy back to the host as a normal matrix
* cm represents "cuda matrix", m represents "matrix"
*/
void generator(float *cm, float *m)
{
// Create a pseudo-random generator with a fixed seed so every run is reproducible.
curandGenerator_t gen;
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, 1234ULL);
// Fill the device matrix cm with N*N uniform random floats.
curandGenerateUniform(gen, cm, N * N);
curandDestroyGenerator(gen);
// Blocking device-to-host copy; it also acts as the synchronization point
// before the host reads m.
cudaMemcpy(m, cm, N * N * sizeof(float), cudaMemcpyDeviceToHost);
}
/*
* check if the result is correct
* dst represents the tranposed, src represents the previous
*/
bool check(float *dst, float *src)
{
// A correct transpose satisfies dst[col][row] == src[row][col] for every
// element. Exact float comparison is intentional: the transpose must be a
// bit-for-bit copy, no arithmetic is involved.
for (int row = 0; row < N; ++row) {
for (int col = 0; col < N; ++col) {
if (dst[col * N + row] != src[row * N + col])
return false;
}
}
return true;
}
我们在
main()
中添加计时的部分,以衡量方法的时间效率。main()
函数主体实现如下其中计时部分参考了https://blog.csdn.net/litdaguang/article/details/50520549中的GPU计时函数部分
省略了执行矩阵转置的
`matrix_trans()
函数和一些变量准备
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <curand.h>
#include <curand_kernel.h>
using namespace std;
#pragma comment(lib, "curand.lib")
#define N 1024
#define TILE 32
/*
* generate random cuda matrix with "curand.h", and copy back to the host as a normal matrix
* cm represents "cuda matrix", m represents "matrix"
*/
void generator(float *cm, float *m)
{
curandGenerator_t gen;
// Check every cuRAND call: a silent failure here would leave cm full of
// garbage and make every later result meaningless.
if (curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS ||
    curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) != CURAND_STATUS_SUCCESS ||
    curandGenerateUniform(gen, cm, N * N) != CURAND_STATUS_SUCCESS) {
fprintf(stderr, "curand setup/generation failed\n");
exit(EXIT_FAILURE);
}
curandDestroyGenerator(gen);
// Blocking device-to-host copy; also the sync point before the host uses m.
cudaError_t err = cudaMemcpy(m, cm, N * N * sizeof(float), cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "cudaMemcpy failed: %s\n", cudaGetErrorString(err));
exit(EXIT_FAILURE);
}
}
/*
* check if the result is correct
* dst represents the tranposed, src represents the previous
*/
bool check(float *dst, float *src)
{
// Verify dst is the transpose of src: dst[j][i] must equal src[i][j].
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++){
// Exact equality is correct here: a transpose moves bits, it does no math.
if (src[i * N + j] != dst[j * N + i])
return false;
}
return true;
}
int main()
{
float *cdst, *csrc;   /* device buffers */
float *dst, *src;     /* host buffers */
if (cudaMalloc(&cdst, N * N * sizeof(float)) != cudaSuccess ||
    cudaMalloc(&csrc, N * N * sizeof(float)) != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed\n");
return 1;
}
dst = (float *)malloc(N * N * sizeof(float));
src = (float *)malloc(N * N * sizeof(float));
if (dst == NULL || src == NULL) {
fprintf(stderr, "host malloc failed\n");
return 1;
}
generator(csrc, src);
/* initialize variables like dimBlock and dimGrid here */
/* Timing with CUDA events measures GPU-side work, not host wall time. */
cudaEvent_t start, stop;
float elapsedTime = 0.0f;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
/* execute the matrix_trans() function here */
cudaEventRecord(stop, 0);
/* Wait for the stop event (and thus the kernel) to finish before reading. */
cudaEventSynchronize(stop);
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
cudaMemcpy(dst, cdst, N * N * sizeof(float), cudaMemcpyDeviceToHost);
if (check(dst, src))
printf("correct answer!\n");
else
printf("wrong answer!\n");
/* cudaEventElapsedTime reports milliseconds, not seconds. */
printf("the executive takes %f ms on average.\n", elapsedTime);
free(dst);
free(src);
cudaFree(cdst);
cudaFree(csrc);
return 0;
}
2.主体转置函数
2.1 naive方法
矩阵共 1024 × 1024 个元素,于是我们将一个
`Grid
分配为1024个Block
,每个Block
负责一行1024个元素,每个Block
中1024个Thread
,每个Thread
负责一个元素

/*
 * transpose matrix src and store the result in matrix dst
 * dst is the transposed output, src is the input
 * naive implementation: one block handles one row, one thread one element
 * launched with dimBlock = (N, 1) and dimGrid = (1, N), so in practice
 * i = threadIdx.x (column) and j = blockIdx.y (row)
 */
__global__ void matrix_trans_1(float *dst, float *src)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
dst[i * N + j] = src[j * N + i];
}

int main()
{
dim3 dimBlock(N, 1);
dim3 dimGrid(1, N);
/*...*/
/* fixed: the launch previously referenced the undefined name naive_trans_1 */
matrix_trans_1<<<dimGrid, dimBlock>>>(cdst, csrc);
/*...*/
}
2.2 优化step1
考虑
shared memory
的性质。我们不妨考虑将矩阵切为一个个 32 × 32 的单位块进行移动,提高访问的聚合性,从而利用 shared memory
带来的局部性。参考了https://blog.csdn.net/bruce_0712/article/details/65018223中有关内容
/*
 * transpose matrix src and store the result in matrix dst
 * optimized step1: each 32 * 32 tile is staged through shared memory
 * launched with dimBlock = (TILE, TILE), dimGrid = (N / TILE, N / TILE)
 * NOTE(review): each thread writes and then reads back its own tile slot,
 * and ind strides by TILE (= 32 floats) across a warp, so the tile accesses
 * serialize on shared-memory banks — consistent with the slowdown measured
 * in the report.
 */
__global__ void matrix_trans_2(float *dst, float *src)
{
__shared__ float tile[TILE * TILE];
int i = blockDim.x * blockIdx.x + threadIdx.x;
int j = blockDim.y * blockIdx.y + threadIdx.y;
int ind = threadIdx.x * TILE + threadIdx.y;
tile[ind] = src[j * N + i];
__syncthreads();
dst[i * N + j] = tile[ind];
}

int main()
{
dim3 dimBlock(TILE, TILE);
dim3 dimGrid(N / TILE, N / TILE);
/*...*/
/* fixed: the launch previously referenced the undefined name naive_trans_2 */
matrix_trans_2<<<dimGrid, dimBlock>>>(cdst, csrc);
/*...*/
}
然而进行测试后,发现
matrix_trans_2()
效率还不如matrix_trans_1
,猜测原因是我们对 32 × 32 的 unit 的聚合性和局部性利用并不好,于是将在下一步优化中调整分块,以求得更好的聚合性和局部性。(naive 版本这么快也是有原因的:每个 warp 是连续访问的,程序有很好的聚合性。)
2.3 优化step2
- 仍保持 32 × 32 的分块,每个 block 管理一个分块,但改进线程分配
- 每个 block 中有 8 × 32 个 thread,每个 thread 每次读 4 × 1 的 4 个元素到 tile 中,然后直接转置后放入 dst 矩阵中,以实现更好的聚合访问
/*
* transpose matrix src, and store the result in matrix dst
* dst represents the tranposed, src represents the previous
* optimized step2: a unit is a 32 * 32 matrix, move by 4 * 1 elements
*/
__global__ void matrix_trans_3(float *dst, float *src)
{
// +1 padding on the inner dimension staggers the banks so the column-wise
// tile accesses below do not all hit the same shared-memory bank.
// Output is unchanged; only the access pattern improves.
__shared__ float tile[TILE][TILE + 1];
int i = blockIdx.x * TILE + threadIdx.x;
int j = blockIdx.y * TILE + threadIdx.y * 4;
int ind = j * N + i;
int tile_i = threadIdx.y * 4;
int tile_j = threadIdx.x;
// Each thread loads a 4 * 1 column strip from src into the tile, stored
// transposed. (The original code reused the global index `i` as this loop
// counter, clobbering it; `k` makes both loops consistent.)
for (int k = 0; k < 4; k++){
tile[tile_j][tile_i] = src[ind];
tile_i++;
ind += N;
}
__syncthreads();
// Write the transposed tile to the mirrored block position so the global
// stores stay coalesced.
i = blockIdx.y * TILE + threadIdx.x;
j = blockIdx.x * TILE + threadIdx.y * 4;
ind = j * N + i;
tile_i = threadIdx.x;
tile_j = threadIdx.y * 4;
for (int k = 0; k < 4; k++){
dst[ind] = tile[tile_j][tile_i];
tile_j++;
ind += N;
}
}
int main()
{
/* 32 x 8 threads per block: each thread moves a 4-element strip */
dim3 dimBlock(TILE, TILE / 4);
dim3 dimGrid(N / TILE, N / TILE);
/*...*/
/* fixed: the launch previously referenced the undefined name naive_trans_3 */
matrix_trans_3<<<dimGrid, dimBlock>>>(cdst, csrc);
/*...*/
}
3
执行及性能
三种实现分别对应提交的
matrix_trans_1.cu, matrix_trans_2.cu, matrix_trans_3.cu
文件运行情况如图
注意到naive版本性能就很不错,step1优化后性能反而下降,step2优化后性能最佳,性能约是naive版本的3倍
分析原因是
step1 时,我们对 32 × 32 的 unit 的聚合性和局部性利用并不好,因此性能反而下降
step2时达到了比较好的聚合性和局部性
4
特别注释
代码位置为
manycore@master:/home/manycore/users/Feng.Haoran 中的matrix_trans_1, 2, 3, hw1_1, 2, 3是编译完成的可执行文件
(1是naive实现,2是step1优化后,3是step2优化后)