// Personally modified convolution template; can be run directly.
/*
 * Fill a 10x6 local-memory tile with image samples, then synchronize the
 * work-group on local memory.
 *
 * The tile covers offsets i = -1..8 and j = -1..4 relative to `gid`, i.e. an
 * 8x4 region plus a one-sample border on every side (60 samples in total).
 * The sample at offset (i, j) is stored in rgb[i + 1][j + 1].
 *
 *   src  image to sample from
 *   s    sampler passed to read_imagef
 *   rgb  destination tile in local memory, indexed [x offset + 1][y offset + 1]
 *   gid  base image coordinate of the calling work-item
 *        (presumably its global id -- confirm against the caller)
 *   lid  local id of the calling work-item inside its work-group
 *
 * Only the work-item with lid == (0, 0) performs the loads, so `gid` of that
 * work-item defines the tile origin.
 */
void prefetch_texture_samples_8x4(image2d_t src, sampler_t s, __local float4 rgb [10][6], int2 gid, int2 lid)
{
    // Exactly one work-item per group fills the tile. Testing lid.x alone
    // would also select (0,1), (0,2), ... whose gid differs in y, so they
    // would race on the same rgb slots with differently based samples.
    if (lid.x == 0 && lid.y == 0) {
        for (int i = -1; i < 9; i++) {
            for (int j = -1; j < 5; j++) {
                rgb[i + 1][j + 1] = read_imagef(src, s, gid + (int2)(i, j));
            }
        }
    }

    // Kept outside the conditional so that every work-item reaches it; after
    // this point the local-memory writes above are visible to the group.
    barrier(CLK_LOCAL_MEM_FENCE);
}
void prefetch_8x4_optimized(image2d_t src, sampler_t s, __local float4 rgb[10][6])
{
// Coord of wi0 in NDRange
int2 wi0Coord = (int2)(get_group_id(0)*8, get_group_id(1)*4);
// 2D to 1D address (from 8x4 to 32x1)
int flatLocal = get_local_id(1)*8 + get_local_id(0);
// Only first 30 work-items load, each loads 2 values in sequence
if (flatLocal < 30)
{
/* Convert from flatLocal 1D id to 2D, 10x3 */
int i = flatLocal % 10; // Width
int j = flatLocal / 10; // Height