Since the cache line size is 64 bytes, i.e. 16 floats (four float4 vectors), how do we make good use of the cache?
1. Each load should ideally pull in an entire cache line (64 bytes) of useful data.
2. When different threads access data, their addresses should ideally be contiguous as well.
With these two goals in mind, how should the data be arranged?
First, TNN's feature-map storage format: (b*h) * (w4 * c/4). dim0 is laid out as in the table below, and dim1 is b*h; this matches MACE.
w0 | w1 | ... | wn | w0 | w1 | ... |
c0 c1 c2 c3 | c0 c1 c2 c3 | ... | c0 c1 c2 c3 | c4 c5 c6 c7 | c4 c5 c6 c7 | ... |
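To make the addressing concrete, here is a small helper (my own sketch, not TNN code; tnn_offset is a name I made up) that returns the float offset of element (c, h, w) in this layout:

static int tnn_offset(int c, int h, int w, int channels, int width)
{
    int w_align = (width + 3) / 4 * 4;     /* width padded to a multiple of 4    */
    int c_align = (channels + 3) / 4 * 4;  /* channels padded to a multiple of 4 */
    return h * w_align * c_align   /* row (dim1 = b*h, batch folded in) */
         + (c / 4) * w_align * 4   /* channel group                     */
         + w * 4                   /* pixel within the group            */
         + (c % 4);                /* channel within the group          */
}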
The weight storage format: (c_out/4) * (c_in * 4).
c_in_0 | c_in_1 | ... | c_in_n | c_in_0 | c_in_1 | ... |
c0 c1 c2 c3 | c0 c1 c2 c3 | ... | c0 c1 c2 c3 | c4 c5 c6 c7 | c4 c5 c6 c7 | ... |
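And the matching sketch for a weight element (c_out, c_in), assuming I read the diagram correctly (weight_offset is again my own name):

static int weight_offset(int c_out, int c_in, int in_channels)
{
    return (c_out / 4) * in_channels * 4  /* row: one group of 4 output channels */
         + c_in * 4                       /* input channel within the row        */
         + (c_out % 4);                   /* output channel within the group     */
}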
The kernel, as I understand it:
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void kernel_name(__read_only image2d_t input_image,
                          const int src_channel,
                          __write_only image2d_t out_image,
                          const int dst_width,
                          __read_only image2d_t weight_image,
                          __global float *bias_ptr)
{
    const int wc = get_global_id(0);
    const int h = get_global_id(1);
    int output_w_align = (dst_width + 3) >> 2;  // number of 4-pixel width groups
    const int c = wc / output_w_align;          // output-channel group index
    const int w = (wc % output_w_align) << 2;   // first of the 4 pixels this thread handles
    output_w_align = output_w_align << 2;       // width aligned up to a multiple of 4
    const int out_c = c << 2;
    float4 v_bias = vload4(0, bias_ptr + out_c);
    float4 v_dst_0 = v_bias;
    float4 v_dst_1 = v_dst_0;
    float4 v_dst_2 = v_dst_0;
    float4 v_dst_3 = v_dst_0;
    int src_c_aligned = (src_channel + 3) >> 2; // number of 4-channel input groups
    for (int i = 0; i < src_c_aligned; ++i)
    {
        // float offset of input-channel group i at pixel w
        // (1x1 convolution: input and output share the same width)
        int idx = i * 4 * output_w_align + w * 4;
        // 4x4 weight tile: input channels 4i..4i+3 -> output channels out_c..out_c+3
        float4 v_weight0 = read_imagef(weight_image, sampler, (int2)((i << 2), c));
        float4 v_weight1 = read_imagef(weight_image, sampler, (int2)((i << 2) + 1, c));
        float4 v_weight2 = read_imagef(weight_image, sampler, (int2)((i << 2) + 2, c));
        float4 v_weight3 = read_imagef(weight_image, sampler, (int2)((i << 2) + 3, c));
        // 4 consecutive pixels, each carrying input channels 4i..4i+3
        float4 v_src0 = read_imagef(input_image, sampler, (int2)((idx >> 2), h));
        float4 v_src1 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 1, h));
        float4 v_src2 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 2, h));
        float4 v_src3 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 3, h));
        v_dst_0 += v_weight0 * v_src0.s0;
        v_dst_0 += v_weight1 * v_src0.s1;
        v_dst_0 += v_weight2 * v_src0.s2;
        v_dst_0 += v_weight3 * v_src0.s3;
        v_dst_1 += v_weight0 * v_src1.s0;
        v_dst_1 += v_weight1 * v_src1.s1;
        v_dst_1 += v_weight2 * v_src1.s2;
        v_dst_1 += v_weight3 * v_src1.s3;
        v_dst_2 += v_weight0 * v_src2.s0;
        v_dst_2 += v_weight1 * v_src2.s1;
        v_dst_2 += v_weight2 * v_src2.s2;
        v_dst_2 += v_weight3 * v_src2.s3;
        v_dst_3 += v_weight0 * v_src3.s0;
        v_dst_3 += v_weight1 * v_src3.s1;
        v_dst_3 += v_weight2 * v_src3.s2;
        v_dst_3 += v_weight3 * v_src3.s3;
    }
    int out_idx = c * output_w_align * 4 + w * 4; // float offset of this thread's first output
    write_imagef(out_image, (int2)((out_idx >> 2), h), v_dst_0);
    write_imagef(out_image, (int2)((out_idx >> 2) + 1, h), v_dst_1);
    write_imagef(out_image, (int2)((out_idx >> 2) + 2, h), v_dst_2);
    write_imagef(out_image, (int2)((out_idx >> 2) + 3, h), v_dst_3);
}
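The indexing above implies the launch geometry: each work-item produces a 4-pixel-by-4-channel output tile. A minimal sketch, assuming queue and kernel handles already exist and out_width/out_channel/out_height hold the output shape:

size_t gws[2] = {
    (size_t)(((out_width + 3) / 4) * ((out_channel + 3) / 4)), /* dim0: wc */
    (size_t)out_height                                         /* dim1: h  */
};
cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, gws, NULL, 0, NULL, NULL);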
The image dimensions used above:
input_image.height = input_height
input_image.width = (input_channel + 3) / 4 * 4 * ((input_width + 3) / 4 * 4), counted in floats
out_image has the same structure as input_image
filter.height = (out_channel + 3) / 4
filter.width = input_channel * 4
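Since a CL_RGBA/CL_FLOAT image packs 4 floats per texel, the actual image width is a quarter of the float count above. A creation sketch (needs <CL/cl.h> and <string.h>; context is assumed, error handling elided; dst_input is the converted buffer produced below):

cl_image_format fmt = { CL_RGBA, CL_FLOAT };
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));
desc.image_type   = CL_MEM_OBJECT_TYPE_IMAGE2D;
desc.image_width  = (size_t)((input_channel + 3) / 4 * ((input_width + 3) / 4 * 4)); /* texels */
desc.image_height = (size_t)input_height;
cl_int err;
cl_mem input_image = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   &fmt, &desc, dst_input, &err);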
Input conversion, NCHW -> target format:
INT32 dst_row = (src_channel + 3) / 4 * 4;        // channels aligned up to 4
INT32 src_width_align = (src_width + 3) / 4 * 4;  // width aligned up to 4
float *dst_input = (float *)calloc(src_height * src_width_align * dst_row, sizeof(float));
for (int h = 0; h < src_height; h++)
{
    for (int w = 0; w < src_width; w++)
    {
        for (int c = 0; c < src_channel; c++)
        {
            // channel group first, then pixel, then channel within the group
            int dst_idx = (c / 4) * src_width_align * 4 + w * 4 + c % 4;
            dst_input[h * src_width_align * dst_row + dst_idx] =
                src_input[c * src_height * src_width + h * src_width + w];
        }
    }
}
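A quick cross-check (my own sketch, needs <assert.h>) that this conversion and the tnn_offset helper from earlier agree on where an element lands:

int c = 5, h = 2, w = 3; /* arbitrary, requires src_channel > 5, src_height > 2, src_width > 3 */
assert(h * src_width_align * dst_row + (c / 4) * src_width_align * 4 + w * 4 + c % 4
       == tnn_offset(c, h, w, src_channel, src_width));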
Filter -> target format:
dst_row = (out_channel + 3) / 4;  // number of 4-output-channel groups
float *dst_weight = (float *)calloc(src_channel * 4 * dst_row, sizeof(float));
for (int c_out = 0; c_out < out_channel; c_out++)
{
    for (int c_in = 0; c_in < src_channel; c_in++)
    {
        // one row per group of 4 output channels; input-channel major within a row
        dst_weight[c_out / 4 * src_channel * 4 + c_in * 4 + c_out % 4] =
            src_weight[c_out * src_channel + c_in];
    }
}
Output, target format back to NCHW:
dst_row = (out_channel + 3) / 4 * 4;
src_width_align = (out_width + 3) / 4 * 4;
float *dst_output = (float *)calloc(out_height * src_width_align * dst_row, sizeof(float));
// dst_output is filled by reading the kernel result back from the device
for (int h = 0; h < out_height; h++)
{
    for (int w = 0; w < out_width; w++)
    {
        for (int c = 0; c < out_channel; c++)
        {
            int dst_idx = (c / 4) * src_width_align * 4 + w * 4 + c % 4;
            output[c * out_height * out_width + h * out_width + w]
                = dst_output[h * src_width_align * dst_row + dst_idx];
        }
    }
}
With TNN's arrangement you can see that, inside the kernel's for loop, input_image advances by a large stride on every iteration (a whole aligned width row per channel group), which does not feel very cache-friendly. So I rearranged the feature map to see:
w0 | w1 | w2 | w3 | w0 | w1 | w2 | w3 | ... |
c0 c1 c2 c3 | c0 c1 c2 c3 | c0 c1 c2 c3 | c0 c1 c2 c3 | c4 c5 c6 c7 | c4 c5 c6 c7 | c4 c5 c6 c7 | c4 c5 c6 c7 | ... |
Arranged this way, each thread's successive for-loop iterations should be friendlier to the cache, since all of a thread's input-channel groups sit contiguously.
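The float offset of (c, h, w) in this rearranged layout, as a sketch mirroring tnn_offset above (again my own helper name):

static int rearranged_offset(int c, int h, int w, int channels, int width)
{
    int w_align = (width + 3) / 4 * 4;
    int c_align = (channels + 3) / 4 * 4;
    return h * w_align * c_align   /* row                                 */
         + (w / 4) * c_align * 4   /* width group                         */
         + (c / 4) * 16            /* channel group within the width group */
         + (w % 4) * 4             /* pixel within the group              */
         + (c % 4);                /* channel within the group            */
}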
The corresponding kernel:
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void kernel_name(__read_only image2d_t input_image,
                          const int src_channel,
                          __write_only image2d_t output_image,
                          const int dst_width,
                          const int dst_channel,
                          __global float *weight,
                          const int weight_width,
                          __global float *bias)
{
    const int wc = get_global_id(0);
    const int h = get_global_id(1);
    const int input_c_align = (src_channel + 3) / 4 * 4;
    const int output_c_align = (dst_channel + 3) / 4 * 4;
    int output_w_align = (dst_width + 3) >> 2;  // number of 4-pixel width groups
    const int c = wc / output_w_align;          // output-channel group index
    const int w = (wc % output_w_align);        // width-group index
    output_w_align = output_w_align << 2;
    const int out_c = c << 2;
    const int out_w = w << 2;
    float4 v_bias = vload4(0, bias + out_c);
    float4 v_dst_0 = v_bias;
    float4 v_dst_1 = v_dst_0;
    float4 v_dst_2 = v_dst_0;
    float4 v_dst_3 = v_dst_0;
    int src_c_aligned = (src_channel + 3) >> 2;
    __global float *weight_tmp = weight + c * weight_width; // this output-channel group's row
    int idx = w * input_c_align; // texel x of this width group's first channel group
    for (int i = 0; i < src_c_aligned; ++i)
    {
        // 4x4 weight tile for input channels 4i..4i+3, contiguous in the buffer
        float4 v_weight0 = vload4(0, weight_tmp + i * 4 * 4);
        float4 v_weight1 = vload4(0, weight_tmp + i * 4 * 4 + 4);
        float4 v_weight2 = vload4(0, weight_tmp + i * 4 * 4 + 8);
        float4 v_weight3 = vload4(0, weight_tmp + i * 4 * 4 + 12);
        // 4 consecutive pixels of channel group i: 4 adjacent texels
        float4 v_src0 = read_imagef(input_image, sampler, (int2)(idx, h));
        float4 v_src1 = read_imagef(input_image, sampler, (int2)(idx + 1, h));
        float4 v_src2 = read_imagef(input_image, sampler, (int2)(idx + 2, h));
        float4 v_src3 = read_imagef(input_image, sampler, (int2)(idx + 3, h));
        v_dst_0 = mad(v_weight0, v_src0.s0, v_dst_0);
        v_dst_0 = mad(v_weight1, v_src0.s1, v_dst_0);
        v_dst_0 = mad(v_weight2, v_src0.s2, v_dst_0);
        v_dst_0 = mad(v_weight3, v_src0.s3, v_dst_0);
        v_dst_1 = mad(v_weight0, v_src1.s0, v_dst_1);
        v_dst_1 = mad(v_weight1, v_src1.s1, v_dst_1);
        v_dst_1 = mad(v_weight2, v_src1.s2, v_dst_1);
        v_dst_1 = mad(v_weight3, v_src1.s3, v_dst_1);
        v_dst_2 = mad(v_weight0, v_src2.s0, v_dst_2);
        v_dst_2 = mad(v_weight1, v_src2.s1, v_dst_2);
        v_dst_2 = mad(v_weight2, v_src2.s2, v_dst_2);
        v_dst_2 = mad(v_weight3, v_src2.s3, v_dst_2);
        v_dst_3 = mad(v_weight0, v_src3.s0, v_dst_3);
        v_dst_3 = mad(v_weight1, v_src3.s1, v_dst_3);
        v_dst_3 = mad(v_weight2, v_src3.s2, v_dst_3);
        v_dst_3 = mad(v_weight3, v_src3.s3, v_dst_3);
        idx += 4; // next channel group of the same 4 pixels is adjacent
    }
    int out_idx = w * output_c_align + c * 4; // texel x in the output image
    write_imagef(output_image, (int2)(out_idx, h), v_dst_0);
    write_imagef(output_image, (int2)(out_idx + 1, h), v_dst_1);
    write_imagef(output_image, (int2)(out_idx + 2, h), v_dst_2);
    write_imagef(output_image, (int2)(out_idx + 3, h), v_dst_3);
}
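On the host, the arguments would be bound roughly as follows (a sketch: kernel, weight_buf, and bias_buf are assumed handles of my own naming; weight_width follows the filter conversion shown earlier, i.e. src_channel * 4 floats per group of 4 output channels, which is safest when src_channel is a multiple of 4):

int weight_width = src_channel * 4; /* floats per output-channel-group row */
cl_int err = CL_SUCCESS;
err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
err |= clSetKernelArg(kernel, 1, sizeof(int), &src_channel);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image);
err |= clSetKernelArg(kernel, 3, sizeof(int), &out_width);
err |= clSetKernelArg(kernel, 4, sizeof(int), &out_channel);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &weight_buf);
err |= clSetKernelArg(kernel, 6, sizeof(int), &weight_width);
err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &bias_buf);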
Input conversion (NCHW -> the new format):
INT32 dst_row = (src_channel + 3) / 4 * 4;
INT32 src_width_align = (src_width + 3) / 4 * 4;
float *dst_input = (float *)calloc(src_height * src_width_align * dst_row, sizeof(float));
for (int h = 0; h < src_height; h++)
{
    for (int w = 0; w < src_width; w++)
    {
        for (int c = 0; c < src_channel; c++)
        {
            // width group first, then channel group, then pixel, then channel
            int dst_idx = (w / 4) * dst_row * 4 + c / 4 * 16 + (w % 4) * 4 + c % 4;
            dst_input[h * src_width_align * dst_row + dst_idx] =
                src_input[c * src_height * src_width + h * src_width + w];
        }
    }
}
Performance analysis of the two kernels:
1. When the input width is small, the two kernels perform roughly the same. This suggests the in-loop contiguity of input_image buys almost nothing; the cache may simply be small, so right after the weights are cached, input_image has to be cached on top of them.
2. When the input width is large, the second kernel is clearly slower. Under parallel execution the data accessed by different threads is poorly contiguous: while the first thread handles pixels w0/w1/w2/w3 and the second handles w4/w5/w6/w7, their addresses are far apart, which prevents coalesced access (made concrete in the sketch below).
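The point in item 2 can be made concrete with a back-of-envelope computation (my own sketch, not profiler data): the float4-texel x coordinate of the first input load of four adjacent work-items in each layout.

#include <stdio.h>

int main(void)
{
    int channels = 64;                  /* example channel count */
    int c_align = (channels + 3) / 4 * 4;
    for (int t = 0; t < 4; t++)         /* four adjacent work-items, same channel group */
    {
        int tnn_x = 4 * t;              /* layout 1: idx >> 2 = i * w_align + 4t, with i = 0 */
        int new_x = t * c_align;        /* layout 2: idx = w_group * c_align */
        printf("thread %d: layout1 texel %d, layout2 texel %d\n", t, tnn_x, new_x);
    }
    /* Layout 1: threads 0..3 read texels 0..15, one contiguous run -> coalesces well.
       Layout 2: starting texels are c_align (here 64) apart -> separate transactions. */
    return 0;
}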
In other words, when arranging data, the ideal is that a single thread makes full use of a cache line while the data accessed by neighboring threads stays contiguous.