OpenCL 优化后的卷积代码

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/chao56789/article/details/50478497

以下是我自己修改过的卷积模板代码,可以直接运行。

/*
 * Fills a 10x6 local-memory tile with source samples: the 8x4 footprint of
 * one work-group plus a one-pixel border on every side, which is what a 3x3
 * filter needs.
 *
 * src - image to sample
 * s   - sampler passed to every read_imagef call
 * rgb - local tile; rgb[i + 1][j + 1] receives the sample at gid + (i, j)
 * gid - image coordinate the tile is anchored at (the kernel in this file
 *       passes the work-group origin)
 * lid - local id of the calling work-item
 *
 * The function ends with a work-group barrier, so every work-item of the
 * group has to call it.
 */
void prefetch_texture_samples_8x4(image2d_t src, sampler_t s, __local float4 rgb [10][6], int2 gid, int2 lid)
{
  // Exactly one work-item performs the load. Testing lid.x alone would let
  // one work-item per row (four in an 8x4 group) repeat all 60 fetches and
  // write the same local slots concurrently.
  if (lid.x == 0 && lid.y == 0) {
    for (int i = -1; i < 9; i++) {
      for (int j = -1; j < 5; j++) {
        rgb[i + 1][j + 1] = read_imagef(src, s, gid + (int2)(i, j));
      }
    }
  }
  // Publish the tile to the rest of the work-group before anyone reads it.
  barrier(CLK_LOCAL_MEM_FENCE);
}

/*
 * Cooperative load of the 10x6 sample tile (8x4 work-group footprint plus a
 * one-pixel border) into local memory. The first 30 work-items of the group
 * each fetch two samples, instead of a single work-item fetching all 60.
 *
 * src - image to sample
 * s   - sampler passed to every read_imagef call
 * rgb - local tile, indexed rgb[x][y] with x in 0..9 and y in 0..5;
 *       rgb[x][y] holds the sample at (group origin) + (x - 1, y - 1)
 *
 * The group origin and the flattening below hard-code an 8x4 work-group.
 * The function ends with a work-group barrier, so every work-item of the
 * group has to call it.
 */
void prefetch_8x4_optimized(image2d_t src, sampler_t s, __local float4 rgb[10][6])
{
  // Image coordinate of the work-group's first work-item in the NDRange
  int2 wi0Coord = (int2)(get_group_id(0)*8, get_group_id(1)*4);

  // 2D to 1D local id (from 8x4 to 32x1)
  int flatLocal = get_local_id(1)*8 + get_local_id(0);

  // Only the first 30 work-items load; each loads 2 values in sequence
  if (flatLocal < 30)
  {
    /* Re-shape the flat id as a 10-wide, 3-high grid */
    int i = flatLocal % 10; // tile column (x), 0..9
    int j = flatLocal / 10; // tile row (y), 0..2

    /* 30 work-items read 10x3 values: tile rows 0-2 (top half).
     * The column index goes first because the tile is declared [10][6];
     * writing rgb[j][i] would push the second index past its bound of 6.
     */
    rgb[i][j] = read_imagef(src, s, (int2)(wi0Coord.x + i - 1, wi0Coord.y + j - 1));

    /* 30 work-items read 10x3 values: tile rows 3-5 (bottom half) */
    rgb[i][j + 3] = read_imagef(src, s, (int2)(wi0Coord.x + i - 1, wi0Coord.y + j + 3 - 1));
  }
  // Publish the tile to the rest of the work-group before anyone reads it.
  barrier(CLK_LOCAL_MEM_FENCE);
}

/*
 * 3x3 weighted filter over an image, one output pixel per work-item.
 *
 * src    - input image, sampled through s
 * dst    - output image; each work-item writes the pixel at its global id
 * s      - sampler used for all source reads
 * weight - 9 filter coefficients, read as weight[row * 3 + column] where
 *          row follows the y offset (-1..1) and column the x offset (-1..1)
 *
 * The work-group size is fixed at 8x4 because the local tile and the group
 * origin computation are written for exactly that shape.
 *
 * Qualifier notes: OpenCL C requires kernel pointer arguments to name an
 * address space, and write_imagef requires a write-only image, hence
 * __constant on weight and __write_only on dst. The host still binds a
 * buffer object and image objects exactly as before.
 */
__attribute__((reqd_work_group_size(8, 4, 1)))
__kernel void blur (__read_only image2d_t src, __write_only image2d_t dst, sampler_t s, __constant float *weight)
{
  // Image coordinate of the work-group's first work-item
  int2 gid = (int2)(get_group_id(0)*8, get_group_id(1)*4);
  int2 lid = (int2)(get_local_id(0),   get_local_id(1));
  float4 pixel = (float4)(0.0f);

  // 8x4 footprint plus a one-pixel border, indexed [x + 1][y + 1]
  __local float4 rgb[10][6];
  prefetch_texture_samples_8x4(src, s, rgb, gid, lid);

  for (int j = -1; j <= 1; j++) {
    for (int i = -1; i <= 1; i++) {
      // i walks the x neighbours and j the y neighbours. Using i for both
      // offsets would sample only the diagonal of the 3x3 window.
      pixel += rgb[lid.x + 1 + i][lid.y + 1 + j] * weight[(j + 1) * 3 + (i + 1)];
    }
  }

  int x = get_global_id(0);
  int y = get_global_id(1);
  // NOTE(review): the weighted sum is also divided by 9, which is only a
  // plain average when every weight is 1 - confirm the intended scaling.
  write_imagef(dst, (int2)(x, y), pixel/9.f);
}


阅读更多
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页