优化GPU程序策略

最新推荐文章于 2024-07-15 16:24:51 发布

mscheng123

最新推荐文章于 2024-07-15 16:24:51 发布

阅读量270

点赞数

分类专栏： CUDA编程文章标签： cuda gpu

本文链接：https://blog.csdn.net/mscheng123/article/details/105484576

版权

CUDA编程专栏收录该内容

1 篇文章 0 订阅

订阅专栏

优化GPU原则与优化步骤

优化目标：

Solve bigger problems
Solve more problems

优化原则

最大化算术强度
减少内存操作花费的时间
合并全局内存访问
避免线程发散
把高频使用数据移到共享内存

\tfrac{Math}{Memory}

优化等级

选择好的算法
基本的高效代码的法则
体系结构具体优化
指令级的操作微观优化

优化的流程与步骤

分析：
- 分析程序瓶颈、什么地方需要做并行、能够提供的资源
并行：
- Libraries
- Directives：OpenMP(CPU)、OpenACC
- Pick an algorithm
优化：
- 测量内存、带宽和占用率等指标

APOD–分析

不要依赖直觉！
分析工具：
- gProf
- VTune
- VerySleepy

APOD–并行

以矩阵转置为例：
- 单个线程处理
- 矩阵每一行作为一个线程处理
进一步提高并行度，优化为：
- 每个线程处理一个元素
进一步优化
- 将数据划分利用共享内存
更进一步优化：缩小分块，取 K=16

六步优化代码

第一步：单个线程处理

// Serial reference transpose: intended to be launched on a single thread.
// Element (i, j) is stored at offset i + j*N in both `in` and `out`;
// every element is copied so that out(j,i) = in(i,j).
__global__ void
transpose_serial(float in[], float out[])
{
	for (int row = 0; row < N; ++row) {
		// Offset of the first element of input row `row`.
		const int src_base = row * N;

		for (int col = 0; col < N; ++col) {
			// Input is walked contiguously; output is written with stride N.
			out[row + col * N] = in[src_base + col];
		}
	}
}

第二步：线程并行处理一行

// Transpose with one thread per row of the output matrix.
// Launch layout: 1D grid of 1D blocks providing at least N threads in total
// (e.g. <<<1, N>>>, or several blocks when N exceeds the block-size limit).
// Threads whose global index is >= N exit without touching memory.
// Element (i, j) is stored at offset i + j*N in both `in` and `out`.
__global__ void
transpose_parallel_per_row(float in[], float out[])
{
	// Global thread index; equals threadIdx.x for a single-block launch,
	// so the original <<<1, N>>> usage behaves exactly as before.
	int i = blockIdx.x * blockDim.x + threadIdx.x;

	// Guard the grid tail: the launch may provide more threads than rows.
	if (i >= N)
		return;

	for (int j = 0; j < N; j++)
		out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}

第三步：线程并行处理单个元素

// Transpose with one thread per element.
// Launch layout: 2D grid of KxK thread blocks covering at least N x N threads
// (the index math below assumes blockDim.x == blockDim.y == K).
// Thread (x,y) in the grid handles element (i,j); threads that fall outside
// the matrix do nothing, so N does not have to be a multiple of K.
// Element (i, j) is stored at offset i + j*N in both `in` and `out`.
__global__ void
transpose_parallel_per_element(float in[], float out[])
{
	int i = blockIdx.x * K + threadIdx.x;
	int j = blockIdx.y * K + threadIdx.y;

	// Bounds guard for the partial tiles at the right/bottom grid edge.
	if (i < N && j < N)
		out[j + i*N] = in[i + j*N]; // out(j,i) = in(i,j)
}

第四步：使用共享内存

// Tiled transpose through shared memory, one thread per element.
// Launch layout: 2D grid of KxK thread blocks covering at least N x N threads
// (the index math below assumes blockDim.x == blockDim.y == K).
// Static shared memory: K*K floats per block.
// Each block reads one KxK input tile and writes the mirrored output tile.
// In both the global read and the global write, threads adjacent in x touch
// adjacent addresses. Threads outside the matrix skip their load/store, so N
// does not have to be a multiple of K.
__global__ void
transpose_parallel_per_element_tiled(float in[], float out[])
{
	// (i,j) locations of the tile corners for input & output matrices:
	int in_corner_i  = blockIdx.x * K, in_corner_j  = blockIdx.y * K;
	int out_corner_i = blockIdx.y * K, out_corner_j = blockIdx.x * K;

	int x = threadIdx.x, y = threadIdx.y;

	__shared__ float tile[K][K];

	// Read from global mem with x as the fastest-varying index and store the
	// element untransposed in shared mem (guarded against the matrix edge):
	if (in_corner_i + x < N && in_corner_j + y < N)
		tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];

	// Barrier sits outside every conditional so all threads of the block
	// reach it; it orders the tile writes above before the reads below.
	__syncthreads();

	// The transposition happens here: read tile[x][y] (swapped indices) and
	// write to global mem with x as the fastest-varying index. The guard
	// mirrors the load guard of the thread that filled tile[x][y], so only
	// initialized tile entries are ever read.
	if (out_corner_i + x < N && out_corner_j + y < N)
		out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}

第五步：缩小分块（K=16），使线程块数量增多

// Tiled transpose through shared memory with a fixed 16x16 tile,
// one thread per element.
// Launch layout: 2D grid of 16x16 thread blocks covering at least N x N
// threads (the index math below assumes blockDim.x == blockDim.y == 16).
// Static shared memory: 16*16 floats per block.
// In both the global read and the global write, threads adjacent in x touch
// adjacent addresses. Threads outside the matrix skip their load/store, so N
// does not have to be a multiple of 16.
__global__ void
transpose_parallel_per_element_tiled16(float in[], float out[])
{
	// Tile edge length, named once instead of repeating the literal.
	const int TILE = 16;

	// (i,j) locations of the tile corners for input & output matrices:
	int in_corner_i  = blockIdx.x * TILE, in_corner_j  = blockIdx.y * TILE;
	int out_corner_i = blockIdx.y * TILE, out_corner_j = blockIdx.x * TILE;

	int x = threadIdx.x, y = threadIdx.y;

	__shared__ float tile[TILE][TILE];

	// Read from global mem with x as the fastest-varying index and store the
	// element untransposed in shared mem (guarded against the matrix edge):
	if (in_corner_i + x < N && in_corner_j + y < N)
		tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];

	// Barrier sits outside every conditional so all threads of the block
	// reach it; it orders the tile writes above before the reads below.
	__syncthreads();

	// The transposition happens here: read tile[x][y] (swapped indices) and
	// write to global mem with x as the fastest-varying index. The guard
	// mirrors the load guard of the thread that filled tile[x][y].
	if (out_corner_i + x < N && out_corner_j + y < N)
		out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}

第六步：将共享内存数组填充为 16×(16+1)，以少量额外的共享内存空间避免 bank 冲突，从而提高速度

// Tiled transpose through shared memory with a fixed 16x16 tile whose rows
// are padded by one float, one thread per element.
// Launch layout: 2D grid of 16x16 thread blocks covering at least N x N
// threads (the index math below assumes blockDim.x == blockDim.y == 16).
// Static shared memory: 16*(16+1) floats per block; the extra column changes
// the row stride from 16 to 17 floats to avoid shared-memory bank conflicts
// on the column-wise read tile[x][y].
// Threads outside the matrix skip their load/store, so N does not have to be
// a multiple of 16.
__global__ void
transpose_parallel_per_element_tiled_padded16(float in[], float out[])
{
	// Tile edge length, named once instead of repeating the literal.
	const int TILE = 16;

	// (i,j) locations of the tile corners for input & output matrices:
	int in_corner_i  = blockIdx.x * TILE, in_corner_j  = blockIdx.y * TILE;
	int out_corner_i = blockIdx.y * TILE, out_corner_j = blockIdx.x * TILE;

	int x = threadIdx.x, y = threadIdx.y;

	// Inner dimension padded by one element; the padding column is never
	// read or written.
	__shared__ float tile[TILE][TILE + 1];

	// Read from global mem with x as the fastest-varying index and store the
	// element untransposed in shared mem (guarded against the matrix edge):
	if (in_corner_i + x < N && in_corner_j + y < N)
		tile[y][x] = in[(in_corner_i + x) + (in_corner_j + y)*N];

	// Barrier sits outside every conditional so all threads of the block
	// reach it; it orders the tile writes above before the reads below.
	__syncthreads();

	// The transposition happens here: read tile[x][y] (swapped indices) and
	// write to global mem with x as the fastest-varying index. The guard
	// mirrors the load guard of the thread that filled tile[x][y].
	if (out_corner_i + x < N && out_corner_j + y < N)
		out[(out_corner_i + x) + (out_corner_j + y)*N] = tile[x][y];
}

mscheng123

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
优化GPU程序策略

优化GPU原则与优化步骤优化目标：Solve bigger problemsSolve more problems优化原则最大化算术强度减少内存操作花费的时间合并全局内存访问避免线程发散把高频使用数据移到共享内存\tfrac{Math}{Memory}优化等级选择好的算法基本的高效代码的法则体系结构具体优化指令级的操作微观优化优化的流程与步骤...
复制链接

扫一扫

专栏目录