TNN conv2d_1x1 OpenCL compute flow

Since the cache line size is 64 bytes, i.e. 16 floats (four float4 values), how can this cache be exploited?

1. Each load should ideally bring in an entire cache line's worth (64 bytes) of useful data.

2. The addresses accessed by different threads should ideally be contiguous as well.

With these two goals in mind, how should the data be laid out?

First, the TNN feature-map storage format: (b*h) × (w * 4 * c/4). dim0 is laid out as in the table below and dim1 is b*h, the same as in MACE.

w:  w0            w1            ...  wn            w0            w1            ...
c:  c0 c1 c2 c3   c0 c1 c2 c3   ...  c0 c1 c2 c3   c4 c5 c6 c7   c4 c5 c6 c7   ...
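
To make this concrete: the float4 pixel holding channels c..c+3 of column w in row h of batch b sits at the image coordinate sketched below (the variable names are mine, not TNN's):

	int w_align = (input_width + 3) / 4 * 4;   // width padded to a multiple of 4
	int x = (c / 4) * w_align + w;             // pixel x: channel block first, then column
	int y = b * input_height + h;              // pixel y: b*h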

Weight storage format: (c_out/4) × (c_in * 4)

c_in:  c_in_0        c_in_1        ...  c_in_n        c_in_0        c_in_1        ...
c_out: c0 c1 c2 c3   c0 c1 c2 c3   ...  c0 c1 c2 c3   c4 c5 c6 c7   c4 c5 c6 c7   ...
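
Likewise for the weights: the float4 pixel that holds output channels c_out..c_out+3 for input channel c_in is at (again a small sketch with my own names):

	int x = c_in;          // pixel x: one input channel per pixel
	int y = c_out / 4;     // pixel y: output-channel block; the 4 lanes are the 4 output channels of that block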

The kernel, as I understand it:

// Sampler used by the image reads below: unnormalized integer coordinates,
// clamp addressing, nearest filtering.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void kernel_name(__read_only image2d_t input_image,
	const int src_channel,
	__write_only image2d_t out_image,
	const int dst_width,
	__read_only image2d_t weight_image,
	__global float *bias_ptr)
{
	// gid0 packs (output-channel block, output-width block); gid1 is the b*h row
	const int wc = get_global_id(0);
	const int h = get_global_id(1);

	int output_w_align = (dst_width + 3) >> 2;   // number of width blocks of 4
	const int c = wc / output_w_align;           // output-channel block index
	const int w = (wc % output_w_align) << 2;    // first of the 4 output columns handled here

	output_w_align = output_w_align << 2;        // output width padded to a multiple of 4
	const int out_c = c << 2;                    // first output channel of this block
	float4 v_bias = vload4(0, bias_ptr + out_c);
	float4 v_dst_0 = (float4)(v_bias);           // accumulators: 4 output channels x 4 columns
	float4 v_dst_1 = v_dst_0;
	float4 v_dst_2 = v_dst_0;
	float4 v_dst_3 = v_dst_0;

	int src_c_aligned = (src_channel + 3) >> 2;  // number of input-channel blocks of 4

	for (int i = 0; i < src_c_aligned; ++i)
	{
		// float offset of (input-channel block i, column w); >> 2 turns it into a pixel x coordinate
		int idx = i * 4 * output_w_align + w * 4;
		// 4x4 weight tile: pixel k holds output channels out_c..out_c+3 for input channel 4*i+k
		float4 v_weight0 = read_imagef(weight_image, sampler, (int2)((i << 2), c));
		float4 v_weight1 = read_imagef(weight_image, sampler, (int2)((i << 2) + 1, c));
		float4 v_weight2 = read_imagef(weight_image, sampler, (int2)((i << 2) + 2, c));
		float4 v_weight3 = read_imagef(weight_image, sampler, (int2)((i << 2) + 3, c));

		// 4 consecutive pixels: input channels 4*i..4*i+3 for columns w..w+3
		float4 v_src0 = read_imagef(input_image, sampler, (int2)((idx >> 2), h));
		float4 v_src1 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 1, h));
		float4 v_src2 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 2, h));
		float4 v_src3 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 3, h));

		// column w
		v_dst_0 += v_weight0 * v_src0.s0;
		v_dst_0 += v_weight1 * v_src0.s1;
		v_dst_0 += v_weight2 * v_src0.s2;
		v_dst_0 += v_weight3 * v_src0.s3;

		// column w + 1
		v_dst_1 += v_weight0 * v_src1.s0;
		v_dst_1 += v_weight1 * v_src1.s1;
		v_dst_1 += v_weight2 * v_src1.s2;
		v_dst_1 += v_weight3 * v_src1.s3;

		// column w + 2
		v_dst_2 += v_weight0 * v_src2.s0;
		v_dst_2 += v_weight1 * v_src2.s1;
		v_dst_2 += v_weight2 * v_src2.s2;
		v_dst_2 += v_weight3 * v_src2.s3;

		// column w + 3
		v_dst_3 += v_weight0 * v_src3.s0;
		v_dst_3 += v_weight1 * v_src3.s1;
		v_dst_3 += v_weight2 * v_src3.s2;
		v_dst_3 += v_weight3 * v_src3.s3;
	}

	// write the 4 results back at (channel block c, columns w..w+3)
	int out_idx = c * output_w_align * 4 + w * 4;
	write_imagef(out_image, (int2)((out_idx >> 2), h), v_dst_0);
	write_imagef(out_image, (int2)((out_idx >> 2) + 1, h), v_dst_1);
	write_imagef(out_image, (int2)((out_idx >> 2) + 2, h), v_dst_2);
	write_imagef(out_image, (int2)((out_idx >> 2) + 3, h), v_dst_3);
}
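
For reference, the launch geometry implied by the indexing above (a host-side sketch of my own; queue, kernel and the dimension variables are placeholders, not TNN code):

	// One work-item per (4 output channels) x (4 output columns) tile, one b*h row per gid1.
	size_t gws[2];
	gws[0] = ((out_channel + 3) / 4) * ((out_width + 3) / 4);  // packed (channel block, width block)
	gws[1] = batch * out_height;                               // b*h rows
	cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, gws, NULL, 0, NULL, NULL);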

where (the widths below count floats; each image pixel is a float4, so the image2d width in pixels is the value divided by 4):

input_image.height = input_height

input_image.width = (input_channel + 3) / 4 * 4 * ((input_width + 3) / 4 * 4);

out_image has the same structure as input_image

filter.height = (out_channel + 3) / 4;

filter.width = input_channel * 4;
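
In pixel units (one CL_RGBA / CL_FLOAT pixel = one float4) that works out to the following; this is my own host-side sketch, with context, err and batch as placeholders:

	cl_int err;
	cl_image_format fmt = { CL_RGBA, CL_FLOAT };   // 1 pixel = 4 floats
	cl_image_desc desc = { 0 };
	desc.image_type   = CL_MEM_OBJECT_TYPE_IMAGE2D;
	desc.image_width  = (input_channel + 3) / 4 * ((input_width + 3) / 4 * 4);  // pixels per row
	desc.image_height = batch * input_height;                                   // b*h rows
	cl_mem input_image = clCreateImage(context, CL_MEM_READ_ONLY, &fmt, &desc, NULL, &err);

	desc.image_width  = (input_channel + 3) / 4 * 4;   // one pixel per (padded) input channel
	desc.image_height = (out_channel + 3) / 4;          // one row per output-channel block
	cl_mem weight_image = clCreateImage(context, CL_MEM_READ_ONLY, &fmt, &desc, NULL, &err);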

 

Input layout conversion, NCHW -> target layout:

	// channels and width padded to multiples of 4
	INT32 dst_row = (src_channel + 3) / 4 * 4;
	INT32 src_width_align = (src_width + 3) / 4 * 4;
	float *dst_input = (float*)calloc(src_height * src_width_align * dst_row, sizeof(float));
	for (int h = 0; h < src_height; h++)
	{
		for (int w = 0; w < src_width; w++)
		{
			for (int c = 0; c < src_channel; c++)
			{
				// within one row: channel block (c/4), then column w, then c%4 inside the float4
				int dst_idx = (c / 4) * src_width_align * 4 + w * 4 + c % 4;
				dst_input[h * src_width_align * dst_row + dst_idx] =
					src_input[c * src_height * src_width + h * src_width + w];
			}
		}
	}
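
The packed buffer can then be uploaded into input_image; a sketch, where queue and input_image are placeholders and the region width is in float4 pixels:

	size_t origin[3] = { 0, 0, 0 };
	size_t region[3] = { (size_t)((src_channel + 3) / 4 * src_width_align),  // width in float4 pixels
	                     (size_t)src_height, 1 };
	clEnqueueWriteImage(queue, input_image, CL_TRUE, origin, region,
	                    0, 0, dst_input, 0, NULL, NULL);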

Filter -> target layout:

	// one row per output-channel block; within a row: input channel, then c_out%4 inside the float4
	dst_row = (out_channel + 3) / 4;
	float *dst_weight = (float*)calloc(src_channel * 4 * dst_row, sizeof(float));
	for (int c_out = 0; c_out < out_channel; c_out++)
	{
		for (int c_in = 0; c_in < src_channel; c_in++)
		{
			dst_weight[c_out / 4 * src_channel * 4 + c_in * 4 + c_out % 4] =
				src_weight[c_out * src_channel + c_in];
		}
	}

Converting the output from the target layout back to NCHW:

	// dst_output holds the packed data read back from out_image; unpack it into output (NCHW)
	dst_row = (out_channel + 3) / 4 * 4;
	src_width_align = (out_width + 3) / 4 * 4;
	float *dst_output = (float*)calloc(out_height * src_width_align * dst_row, sizeof(float));
	for (int h = 0; h < out_height; h++)
	{
		for (int w = 0; w < out_width; w++)
		{
			for (int c = 0; c < out_channel; c++)
			{
				int dst_idx = (c / 4) * src_width_align * 4 + w * 4 + c % 4;
				output[c * out_height * out_width + h * out_width + w]
					= dst_output[h * src_width_align * dst_row + dst_idx];
			}
		}
	}
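
To sanity-check the kernel, the unpacked result can be compared against a plain NCHW reference; a minimal sketch of my own, where bias and ref_output are placeholders (for a stride-1 1x1 convolution, out_height/out_width equal src_height/src_width):

	// Reference 1x1 convolution in NCHW, used only for verification.
	for (int c_out = 0; c_out < out_channel; c_out++)
	{
		for (int h = 0; h < out_height; h++)
		{
			for (int w = 0; w < out_width; w++)
			{
				float acc = bias[c_out];
				for (int c_in = 0; c_in < src_channel; c_in++)
				{
					acc += src_weight[c_out * src_channel + c_in] *
						src_input[c_in * src_height * src_width + h * src_width + w];
				}
				ref_output[c_out * out_height * out_width + h * out_width + w] = acc;
			}
		}
	}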

With TNN's arrangement, you can see that inside the kernel's for loop the input_image reads jump by a large stride on every iteration, which does not feel very cache friendly. So I rearranged the feature map to see what happens:

w:  w0            w1            w2            w3            w0            w1            w2            w3            ...
c:  c0 c1 c2 c3   c0 c1 c2 c3   c0 c1 c2 c3   c0 c1 c2 c3   c4 c5 c6 c7   c4 c5 c6 c7   c4 c5 c6 c7   c4 c5 c6 c7   ...
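
In this arrangement the pixel coordinate becomes (again a sketch with my own names, matching the conversion code further down):

	int c_align = (src_channel + 3) / 4 * 4;            // channels padded to a multiple of 4
	int x = (w / 4) * c_align + (c / 4) * 4 + (w % 4);  // pixel x: width block, then channel block, then column
	int y = b * input_height + h;                       // pixel y: b*h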

 

With this arrangement, a single thread's accesses across the for loop iterations should be more cache friendly.

The corresponding kernel:

// Same sampler as before: unnormalized integer coordinates, clamp addressing, nearest filtering.
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void kernel_name(__read_only image2d_t input_image,
		const int src_channel,
		__write_only image2d_t output_image,
		const int dst_width,
		const int dst_channel,
		__global float *weight,
		const int weight_width,
		__global float *bias)
{
	// gid0 packs (output-channel block, output-width block); gid1 is the b*h row
	const int wc = get_global_id(0);
	const int h = get_global_id(1);

	const int input_c_align = (src_channel + 3) / 4 * 4;   // input channels padded to a multiple of 4
	const int output_c_align = (dst_channel + 3) / 4 * 4;  // output channels padded to a multiple of 4

	int output_w_align = (dst_width + 3) >> 2;   // number of width blocks of 4
	const int c = wc / output_w_align;           // output-channel block index
	const int w = (wc % output_w_align);         // output-width block index

	const int out_c = c << 2;                    // first output channel of this block

	float4 v_bias = vload4(0, bias + out_c);
	float4 v_dst_0 = (float4)(v_bias);           // accumulators: 4 output channels x 4 columns
	float4 v_dst_1 = v_dst_0;
	float4 v_dst_2 = v_dst_0;
	float4 v_dst_3 = v_dst_0;

	int src_c_aligned = (src_channel + 3) >> 2;  // number of input-channel blocks of 4

	__global float* weight_tmp = weight + c * weight_width;  // weights of this output-channel block
	int idx = w * input_c_align;                 // pixel x of (width block w, input-channel block 0)
	for (int i = 0; i < src_c_aligned; ++i)
	{
		// 4x4 weight tile: vector k holds output channels out_c..out_c+3 for input channel 4*i+k
		float4 v_weight0 = vload4(0, weight_tmp + i * 4 * 4);
		float4 v_weight1 = vload4(0, weight_tmp + i * 4 * 4 + 4);
		float4 v_weight2 = vload4(0, weight_tmp + i * 4 * 4 + 8);
		float4 v_weight3 = vload4(0, weight_tmp + i * 4 * 4 + 12);

		// 4 consecutive pixels: columns 4*w..4*w+3, each holding input channels 4*i..4*i+3
		float4 v_src0 = read_imagef(input_image, sampler, (int2)(idx, h));
		float4 v_src1 = read_imagef(input_image, sampler, (int2)(idx + 1, h));
		float4 v_src2 = read_imagef(input_image, sampler, (int2)(idx + 2, h));
		float4 v_src3 = read_imagef(input_image, sampler, (int2)(idx + 3, h));

		// column 4*w
		v_dst_0 = mad(v_weight0, v_src0.s0, v_dst_0);
		v_dst_0 = mad(v_weight1, v_src0.s1, v_dst_0);
		v_dst_0 = mad(v_weight2, v_src0.s2, v_dst_0);
		v_dst_0 = mad(v_weight3, v_src0.s3, v_dst_0);

		// column 4*w + 1
		v_dst_1 = mad(v_weight0, v_src1.s0, v_dst_1);
		v_dst_1 = mad(v_weight1, v_src1.s1, v_dst_1);
		v_dst_1 = mad(v_weight2, v_src1.s2, v_dst_1);
		v_dst_1 = mad(v_weight3, v_src1.s3, v_dst_1);

		// column 4*w + 2
		v_dst_2 = mad(v_weight0, v_src2.s0, v_dst_2);
		v_dst_2 = mad(v_weight1, v_src2.s1, v_dst_2);
		v_dst_2 = mad(v_weight2, v_src2.s2, v_dst_2);
		v_dst_2 = mad(v_weight3, v_src2.s3, v_dst_2);

		// column 4*w + 3
		v_dst_3 = mad(v_weight0, v_src3.s0, v_dst_3);
		v_dst_3 = mad(v_weight1, v_src3.s1, v_dst_3);
		v_dst_3 = mad(v_weight2, v_src3.s2, v_dst_3);
		v_dst_3 = mad(v_weight3, v_src3.s3, v_dst_3);

		idx += 4;  // advance to the next input-channel block
	}

	// write the 4 columns back at (width block w, output-channel block c)
	int out_idx = w * output_c_align + c * 4;
	write_imagef(output_image, (int2)(out_idx, h), v_dst_0);
	write_imagef(output_image, (int2)(out_idx + 1, h), v_dst_1);
	write_imagef(output_image, (int2)(out_idx + 2, h), v_dst_2);
	write_imagef(output_image, (int2)(out_idx + 3, h), v_dst_3);
}

Input layout conversion for this rearranged format:

	// same packed buffer size as before; only the order inside each row changes
	INT32 dst_row = (src_channel + 3) / 4 * 4;
	INT32 src_width_align = (src_width + 3) / 4 * 4;
	float *dst_input = (float*)calloc(src_height * src_width_align * dst_row, sizeof(float));
	for (int h = 0; h < src_height; h++)
	{
		for (int w = 0; w < src_width; w++)
		{
			for (int c = 0; c < src_channel; c++)
			{
				// within one row: width block (w/4), then channel block (c/4), then column, then channel
				int dst_idx = (w / 4) * dst_row * 4 + c / 4 * 16 + (w % 4) * 4 + c % 4;
				dst_input[h * src_width_align * dst_row + dst_idx] =
					src_input[c * src_height * src_width + h * src_width + w];
			}
		}
	}

Performance comparison of the two kernels:

1. When the input width is small, the two kernels perform roughly the same, which makes it look as if keeping the input_image reads contiguous inside the for loop hardly matters. One possible reason is that the cache is small: it has just been filled with the weights and then immediately has to hold the input_image data again.

2. When the input width is large, the second kernel is clearly slower. Under parallel execution, the data accessed by adjacent threads is poorly laid out: for example, while the first thread handles columns w0/w1/w2/w3 and the second thread handles w4/w5/w6/w7, the addresses the two threads touch are far apart, so the accesses cannot be coalesced.

In other words, when arranging data, a single thread should ideally be able to make full use of one cache line, and the data accessed by neighbouring threads should ideally be contiguous.
