OpenCL 增强单work-item kernel性能策略

weixin_45121123

于 2020-08-03 16:42:36 发布

阅读量127

点赞数

原文链接：https://www.cnblogs.com/zhuzhudong/p/13403737.html

版权

根据优化报告的反馈，可以通过实现更简单的内存访问的方式来消除循环携带的依赖关系。

优化前：

复制代码
1 #define N 128
2
3 __kernel void unoptimized (__global int * restrict A,
4 __global int * restrict B,
5 __global int* restrict result)
6 {
7 int sum = 0;
8
9 for (unsigned i = 0; i < N; i++) {
10 for (unsigned j = 0; j < N; j++) {
11 sum += A[i*N+j];
12 }
13 sum += B[i];
14 }
15
16 * result = sum;
17 }
复制代码
优化后：

复制代码
1 #define N 128
2
3 __kernel void optimized (__global int * restrict A,
4 __global int * restrict B,
5 __global int * restrict result)
6 {
7 int sum = 0;
8
9 for (unsigned i = 0; i < N; i++) {
10 // Step 1: Definition
11 int sum2 = 0;
12
13 // Step 2: Accumulation of array A values for one outer loop iteration
14 for (unsigned j = 0; j < N; j++) {
15 sum2 += A[i*N+j];
16 }
17
18 // Step 3: Addition of array B value for an outer loop iteration
19 sum += sum2;
20 sum += B[i];
21 }
22
23 * result = sum;
24 }
复制代码
(2) Relaxing Loop-Carried Dependency

根据优化报告的反馈，可以通过增加依赖距离的方式来relax缓解循环携带的依赖关系。

优化前：

复制代码
1 #define N 128
2
3 __kernel void unoptimized (__global float * restrict A,
4 __global float * restrict result)
5 {
6 float mul = 1.0f;
7
8 for (unsigned i = 0; i < N; i++)
9 mul *= A[i];
10
11 * result = mul;
12 }
复制代码
优化后：

复制代码
1 #define N 128
2 #define M 8
3
4 __kernel void optimized (__global float * restrict A,
5 __global float * restrict result)
6 {
7 float mul = 1.0f;
8
9 // Step 1: Declare multiple copies of variable mul
10 float mul_copies[M];
11
12 // Step 2: Initialize all copies
13 for (unsigned i = 0; i < M; i++)
14 mul_copies[i] = 1.0f;
15
16 for (unsigned i = 0; i < N; i++) {
17 // Step 3: Perform multiplication on the last copy
18 float cur = mul_copies[M-1] * A[i];
19
20 // Step 4a: Shift copies
21 #pragma unroll
22 for (unsigned j = M-1; j > 0; j–)
23 mul_copies[j] = mul_copies[j-1];
24
25 // Step 4b: Insert updated copy at the beginning
26 mul_copies[0] = cur;
27 }
28
29 // Step 5: Perform reduction on copies
30 #pragma unroll
31 for (unsigned i = 0; i < M; i++)
32 mul *= mul_copies[i];
33
34 * result = mul;
35 }
复制代码
(3) Transferring Loop-Carried Dependency to Local Memory

对于不能消除的循环携带的依赖关系，可以通过将具有循环依赖的数组从全局内存global memory移动到local memory的方式来改进循环的启动间隔（initiation interval, II）。

优化前：

复制代码
1 #define N 128
2
3 __kernel void unoptimized( __global int* restrict A )
4 {
5 for (unsigned i = 0; i < N; i++)
6 A[N-i] = A[i];
7 }
复制代码
优化后：

复制代码
1 #define N 128
2
3 __kernel void optimized( __global int* restrict A )
4 {
5 int B[N];
6
7 for (unsigned i = 0; i < N; i++)
8 B[i] = A[i];
9
10 for (unsigned i = 0; i < N; i++)
11 B[N-i] = B[i];
12
13 for (unsigned i = 0; i < N; i++)
14 A[i] = B[i];
15 }
复制代码
(4) Relaxing Loop-Carried Dependency by Inferring Shift Registers

为了使SDK能够有效地处理执行双精度浮点运算的单个work-item kernels，可以通过推断移位寄存器的方式移除循环携带的依赖性。

优化前：

复制代码
1 __kernel void double_add_1 (__global double *arr,
2 int N,
3 __global double *result)
4 {
5 double temp_sum = 0;
6
7 for (int i = 0; i < N; ++i)
8 {
9 temp_sum += arr[i];
10 }
11
12 *result = temp_sum;
13 }
复制代码
优化后：

复制代码
1 //Shift register size must be statically determinable
2 #define II_CYCLES 12
3
4 __kernel void double_add_2 (__global double *arr,
5 int N,
6 __global double *result)
7 {
8 //Create shift register with II_CYCLE+1 elements
9 double shift_reg[II_CYCLES+1];
10
11 //Initialize all elements of the register to 0
12 for (int i = 0; i < II_CYCLES + 1; i++)
13 {
14 shift_reg[i] = 0;
15 }
16
17 //Iterate through every element of input array
18 for(int i = 0; i < N; ++i)
19 {
20 //Load ith element into end of shift register
21 //if N > II_CYCLE, add to shift_reg[0] to preserve values
22 shift_reg[II_CYCLES] = shift_reg[0] + arr[i];
23
24 #pragma unroll
25 //Shift every element of shift register
26 for(int j = 0; j < II_CYCLES; ++j)
27 {
28 shift_reg[j] = shift_reg[j + 1];
29 }
30 }
31
32 //Sum every element of shift register
33 double temp_sum = 0;
34
35 #pragma unroll
36 for(int i = 0; i < II_CYCLES; ++i)
37 {
38 temp_sum += shift_reg[i];
39 }
40
41 *result = temp_sum;
42 }
复制代码
(5) Removing Loop-Carried Dependencies Cause by Accesses to Memory Arrays

在单个work-item的kernel中包含ivdep pragma以保证堆内存array的访问不会导致循环携带的依赖性。

如果对循环中的内存阵列的所有访问都不会导致循环携带的依赖性，在内核代码中的循环之前添加#pragma ivdep。

复制代码
// no loop-carried dependencies for A and B array accesses
#pragma ivdep
for (int i = 0; i < N; i++) {
A[i] = A[i - X[i]];
B[i] = B[i - Y[i]];
}
复制代码
　　要指定对循环内特定内存数组的访问不会导致循环携带的依赖关系，在内核代码中的循环之前添加#pragma ivdep arrayarray_name）。

ivdep pragma指定的数组必须是local / private内存数组，或者是指向global/local/private内存存储的指针变量。如果指定的数组是指针，则ivdep pragma也适用于所有可能带有指定指针别名的数组。

ivdep pragma指示指定的数组也可以是struct的数组或指针成员。

复制代码
// No loop-carried dependencies for A array accesses
// The offline compiler will insert hardware that reinforces dependency constraints for B
#pragma ivdep array(A)
for (int i = 0; i < N; i++) {
A[i] = A[i - X[i]];
B[i] = B[i - Y[i]];
}

// No loop-carried dependencies for array A inside struct
#pragma ivdep array(S.A)
for (int i = 0; i < N; i++) {
S.A[i] = S.A[i - X[i]];
}

// No loop-carried dependencies for array A inside the struct pointed by S
#pragma ivdep array(S->X[2][3].A)
for (int i = 0; i < N; i++) {
S->X[2][3].A[i] = S.A[i - X[i]];
}

// No loop-carried dependencies for A and B because ptr aliases
// with both arrays
int *ptr = select ? A : B;
#pragma ivdep array(ptr)
for (int i = 0; i < N; i++) {
A[i] = A[i - X[i]];
B[i] = B[i - Y[i]];
}

// No loop-carried dependencies for A because ptr only aliases with A
int *ptr = &A[10];
#pragma ivdep array(ptr)
for (int i = 0; i < N; i++) {
A[i] = A[i - X[i]];
B[i] = B[i - Y[i]];
}
深圳网站建设www.sz886.com

weixin_45121123

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
OpenCL 增强单work-item kernel性能策略

根据优化报告的反馈，可以通过实现更简单的内存访问的方式来消除循环携带的依赖关系。优化前：复制代码1 #define N 12823 __kernel void unoptimized (__global int * restrict A,4 __global int * restrict B,5 __global int* restrict result)6 {7 int sum
复制链接

扫一扫