- 删除 loop-carried(循环携带)依赖
- // Baseline reduction: adds A[row * N + col] for every row and col below N,
- // plus B[row] for every row, into one accumulator and stores it in *result.
- // The same accumulator is updated in both the inner and the outer loop.
- // N is not defined in this snippet and must be provided elsewhere.
- __kernel void unoptimized(__global int * restrict A,
- __global int * restrict B,
- __global int * restrict result)
- {
- int total = 0;
- for (unsigned row = 0; row < N; ++row) {
- for (unsigned col = 0; col < N; ++col) {
- total = total + A[row * N + col];
- }
- total = total + B[row];
- }
- result[0] = total;
- }
在内层循环中改用局部变量 sum2 进行累加,外层循环每次迭代只把 sum2 加到 sum 上一次,这样内层循环不再直接更新 sum,从而解除了该循环携带依赖。
- // Same reduction as the single-accumulator form: adds A[row * N + col] for
- // every row and col below N, plus B[row] for every row, and stores the total
- // in *result. Each row is first summed into its own local variable, so the
- // inner loop never touches the overall accumulator.
- // N is not defined in this snippet and must be provided elsewhere.
- __kernel void optimized(__global int * restrict A,
- __global int * restrict B,
- __global int * restrict result)
- {
- int total = 0;
- for (unsigned row = 0; row < N; ++row) {
- // Per-row accumulator, re-created on every outer iteration.
- int row_total = 0;
- for (unsigned col = 0; col < N; ++col) {
- row_total = row_total + A[row * N + col];
- }
- // Fold the finished row and the matching B element into the total.
- total = total + row_total + B[row];
- }
- result[0] = total;
- }
6.2
- #define N 128
- // Multiplies A[0] .. A[N-1] together, in index order, into a single running
- // product and stores it in *result. Every iteration reads the product
- // written by the previous iteration.
- __kernel void unoptimized(__global float * restrict A,
- __global float * restrict result)
- {
- float product = 1.0f;
- for (unsigned idx = 0; idx < N; ++idx) {
- product = product * A[idx];
- }
- result[0] = product;
- }
未优化时,float 类型的乘法使该循环的 II 为 3;优化之后 II 为 1。优化思路是不再用单个变量保存乘积,而是对该变量的 M 个副本进行操作:把部分乘积存放在长度为 M 的数组中,每次迭代用数组最后一个元素乘以 A[i],然后将数组整体移位,再把新的结果写入第 0 个元素。这样数组的 M 个元素各自保存了一部分乘积,循环结束后把它们全部相乘即为最终结果。
- #define N 128
- #define M 8
- // Computes the product of A[0] .. A[N-1] and stores it in *result.
- // Instead of one running product, M partial products are kept in
- // mul_copies and shifted by one slot every iteration, so the value
- // multiplied in iteration i is the one that was produced in iteration i - M.
- // The float multiplications are grouped differently from a single running
- // product, so the result may differ from it by rounding.
- __kernel void optimized(__global float * restrict A,
- __global float * restrict result)
- {
- float mul = 1.0f;
- float mul_copies[M];
- // Start every partial product at the multiplicative identity.
- for(unsigned i = 0;i < M;i++)
- mul_copies[i] = 1.0f;
- for(unsigned i=0;i<N;i++){
- // Extend the oldest partial product with the next input element.
- float cur = mul_copies[M-1] * A[i];
- // Shift every copy up by one slot, highest index first, so that no
- // value is overwritten before it has been moved.
- #pragma unroll
- for(unsigned j = M-1;j >0;j--){
- mul_copies[j] = mul_copies[j-1];
- }
- // Insert the new value only after the shift has finished. Storing it
- // inside the shift loop would overwrite slot 0 before it is copied to
- // slot 1 and drop one partial product per iteration.
- mul_copies[0] = cur;
- }
- // Combine the M partial products into the final result.
- #pragma unroll
- for(unsigned i =0;i < M;i++)
- mul *= mul_copies[i];
- *result = mul;
- }
对于无法消除的循环携带依赖,可以把带有该依赖的数组从全局内存搬到本地内存中进行运算,以此来改进 II。
- #define N 128
- // In-place copy on the global buffer: for src = 0 .. N-1 the element A[src]
- // is stored to A[N - src]. Later iterations read elements that earlier
- // iterations have already written.
- // NOTE(review): the src == 0 iteration stores to A[N], so the buffer
- // presumably has to hold at least N + 1 floats - confirm against the host code.
- __kernel void unoptimized(__global float * restrict A)
- {
- for (unsigned src = 0; src < N; ++src) {
- unsigned dst = N - src;
- A[dst] = A[src];
- }
- }
- #define N 128
- // Performs the copy B[N - i] = B[i] (i = 0 .. N-1) on a kernel-private
- // staging array instead of on the global buffer: A[0] .. A[N-1] are loaded
- // into B, the dependent loop runs on B only, and B is then written back to A.
- // The i == 0 iteration stores to index N, so B is sized N + 1 and the
- // write-back covers indices 0 .. N; this yields the same final contents as
- // running A[N - i] = A[i] directly on the global buffer.
- // NOTE(review): A therefore has to hold at least N + 1 floats - confirm
- // against the host code.
- __kernel void optimized(__global float * restrict A)
- {
- // One extra slot: index N is written by the first iteration of the copy loop.
- float B[N + 1];
- // Stage the N input elements; B[N] is written below before it is read back.
- for(unsigned i =0;i< N;i++){
- B[i] = A[i];
- }
- // The loop-carried dependency now only involves the private array.
- for(unsigned i =0;i< N;i++){
- B[N-i] = B[i];
- }
- // Write back every element the copy loop may have produced, including index N.
- for(unsigned i =0;i<= N;i++){
- A[i] = B[i];
- }
- }