优化GPU原则与优化步骤
优化目标:
- Solve bigger problems
- Solve more problems
优化原则
- 最大化算术强度
- 减少内存操作花费的时间
- 合并全局内存访问
- 避免线程发散
- 把高频使用数据移到共享内存
\tfrac{Math}{Menmory}
优化等级
- 选择好的算法
- 基本的高效代码的法则
- 体系机构具体优化
- 指令级的操作微观优化
优化的流程与步骤
- 分析:
- 分析程序瓶颈、什么地方需要做并行、能够提供的资源
- 并行:
- Libraries:OpenMP(CPU),OpenACC
- Directive
- Pick an algorithm
- 优化:
- 测量内存、带宽和占用率等指标
APOD–分析
-
不要依赖直觉!
-
分析工具:
- gProf
- VTune
- VerySleepy
APOD–并行
-
以矩阵转置为例:
- 单个线程处理
- 矩阵每一行作为一个线程处理
-
将程序并行化提高优化为:
- 每个线程处理一个元素
-
进一步优化
- 将数据划分利用共享内存
- 将数据划分利用共享内存
-
进进一步优化,K=16(缩小)
六步优化代码
- 第一步:单个线程处理
// to be launched on a single thread
__global__ void
transpose_serial(float in[], float out[])
{
for(int j=0; j < N; j++)
for(int i=0; i < N; i++)
out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
- 第二步:线程并行处理一行
// to be launched with one thread per row of output matrix
__global__ void
transpose_parallel_per_row(float in[], float out[])
{
int i = threadIdx.x;
for(int j=0; j < N; j++)
out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
- 第三步:线程并行处理单个元素
// to be launched with one thread per element, in KxK threadblocks
// thread (x,y) in grid writes element (i,j) of output matrix
__global__ void
transpose_parallel_per_element(float in[], float out[])
{
int i = blockIdx.x * K + threadIdx.x;
int j = blockIdx.y * K + threadIdx.y;
out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}
- 第四步:使用共享内存
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void
transpose_parallel_per_element_tiled(float in[], float out[])
{
// (i,j) locations of the tile corners for input & output matrices:
int in_corner_i = blockIdx.x * K, in_corner_j = blockIdx.y * K;
int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;
int x = threadIdx.x, y = threadIdx.y;
__shared__ float tile[K][K];
// coalesced read from global mem, TRANSPOSED write into shared mem:
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
__syncthreads();
// read from shared mem, coalesced write to global mem:
out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
- 第五步:数据分块增多
// to be launched with one thread per element, in (tilesize)x(tilesize) threadblocks
// thread blocks read & write tiles, in coalesced fashion
// adjacent threads read adjacent input elements, write adjacent output elmts
__global__ void
transpose_parallel_per_element_tiled16(float in[], float out[])
{
// (i,j) locations of the tile corners for input & output matrices:
int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
int x = threadIdx.x, y = threadIdx.y;
__shared__ float tile[16][16];
// coalesced read from global mem, TRANSPOSED write into shared mem:
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
__syncthreads();
// read from shared mem, coalesced write to global mem:
out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}
- 第六步:16+1操作增加共享内存空间,也能提高速度
// to be launched with one thread per element, in KxK threadblocks
// thread blocks read & write tiles, in coalesced fashion
// shared memory array padded to avoid bank conflicts
__global__ void
transpose_parallel_per_element_tiled_padded16(float in[], float out[])
{
// (i,j) locations of the tile corners for input & output matrices:
int in_corner_i = blockIdx.x * 16, in_corner_j = blockIdx.y * 16;
int out_corner_i = blockIdx.y * 16, out_corner_j = blockIdx.x * 16;
int x = threadIdx.x, y = threadIdx.y;
__shared__ float tile[16][16+1];
// coalesced read from global mem, TRANSPOSED write into shared mem:
tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];
__syncthreads();
// read from shared mem, coalesced write to global mem:
out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}