Since the cache line size is 64 bytes, i.e. 16 floats (four float4 vectors), how do we make good use of the cache?
1. Each load should ideally pull in an entire cache line (64 bytes) of useful data.
2. When different threads access data, their addresses should ideally be contiguous as well.
With these two goals in mind, how should the data be arranged?
First, TNN's feature-map storage format: (b*h) * (w4 * c/4). dim0 is laid out as in the table below, and dim1 is b*h; this matches MACE.
w0 | w1 | ... | wn | w0 | w1 | ... |
c0 c1 c2 c3 | c0 c1 c2 c3 | ... | c0 c1 c2 c3 | c4 c5 c6 c7 | c4 c5 c6 c7 | ... |
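To make the addressing concrete, here is a small helper (my own sketch, not TNN code; tnn_offset is a name I made up) that returns the float offset of element (c, h, w) in this layout:

static int tnn_offset(int c, int h, int w, int channels, int width)
{
    int w_align = (width + 3) / 4 * 4;     /* width padded to a multiple of 4    */
    int c_align = (channels + 3) / 4 * 4;  /* channels padded to a multiple of 4 */
    return h * w_align * c_align   /* row (dim1 = b*h, batch folded in) */
         + (c / 4) * w_align * 4   /* channel group                     */
         + w * 4                   /* pixel within the group            */
         + (c % 4);                /* channel within the group          */
}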
The weight storage format: (c_out/4) * (c_in * 4).
c_in_0 | c_in_1 | ... | c_in_n | c_in_0 | c_in_1 | ... |
c0 c1 c2 c3 | c0 c1 c2 c3 | ... | c0 c1 c2 c3 | c4 c5 c6 c7 | c4 c5 c6 c7 | ... |
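And the matching sketch for a weight element (c_out, c_in), assuming I read the diagram correctly (weight_offset is again my own name):

static int weight_offset(int c_out, int c_in, int in_channels)
{
    return (c_out / 4) * in_channels * 4  /* row: one group of 4 output channels */
         + c_in * 4                       /* input channel within the row        */
         + (c_out % 4);                   /* output channel within the group     */
}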
The kernel, as I understand it:
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void kernel_name(__read_only image2d_t input_image,
                          const int src_channel,
                          __write_only image2d_t out_image,
                          const int dst_width,
                          __read_only image2d_t weight_image,
                          __global float *bias_ptr)
{
    const int wc = get_global_id(0);
    const int h = get_global_id(1);
    int output_w_align = (dst_width + 3) >> 2;  // number of 4-pixel width groups
    const int c = wc / output_w_align;          // output-channel group index
    const int w = (wc % output_w_align) << 2;   // first of the 4 pixels this thread handles
    output_w_align = output_w_align << 2;       // width aligned up to a multiple of 4
    const int out_c = c << 2;
    float4 v_bias = vload4(0, bias_ptr + out_c);
    float4 v_dst_0 = v_bias;
    float4 v_dst_1 = v_dst_0;
    float4 v_dst_2 = v_dst_0;
    float4 v_dst_3 = v_dst_0;
    int src_c_aligned = (src_channel + 3) >> 2; // number of 4-channel input groups
    for (int i = 0; i < src_c_aligned; ++i)
    {
        // float offset of input-channel group i at pixel w
        // (1x1 convolution: input and output share the same width)
        int idx = i * 4 * output_w_align + w * 4;
        // 4x4 weight tile: input channels 4i..4i+3 -> output channels out_c..out_c+3
        float4 v_weight0 = read_imagef(weight_image, sampler, (int2)((i << 2), c));
        float4 v_weight1 = read_imagef(weight_image, sampler, (int2)((i << 2) + 1, c));
        float4 v_weight2 = read_imagef(weight_image, sampler, (int2)((i << 2) + 2, c));
        float4 v_weight3 = read_imagef(weight_image, sampler, (int2)((i << 2) + 3, c));
        // 4 consecutive pixels, each carrying input channels 4i..4i+3
        float4 v_src0 = read_imagef(input_image, sampler, (int2)((idx >> 2), h));
        float4 v_src1 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 1, h));
        float4 v_src2 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 2, h));
        float4 v_src3 = read_imagef(input_image, sampler, (int2)((idx >> 2) + 3, h));
        v_dst_0 += v_weight0 * v_src0.s0;
        v_dst_0 += v_weight1 * v_src0.s1;
        v_dst_0 += v_weight2 * v_src0.s2;
        v_dst_0 += v_weight3 * v_src0.s3;
        v_dst_1 += v_weight0 * v_src1.s0;
        v_dst_1 += v_weight1 * v_src1.s1;
        v_dst_1 += v_weight2 * v_src1.s2;
        v_dst_1 += v_weight3 * v_src1.s3;
        v_dst_2 += v_weight0 * v_src2.s0;
        v_dst_2 += v_weight1 * v_src2.s1;
        v_dst_2 += v_weight2 * v_src2.s2;
        v_dst_2 += v_weight3 * v_src2.s3;
        v_dst_3 += v_weight0 * v_src3.s0;
        v_dst_3 += v_weight1 * v_src3.s1;
        v_dst_3 += v_weight2 * v_src3.s2;
        v_dst_3 += v_weight3 * v_src3.s3;
    }
    int out_idx = c * output_w_align * 4 + w * 4; // float offset of this thread's first output
    write_imagef(out_image, (int2)((out_idx >> 2), h), v_dst_0);
    write_imagef(out_image, (int2)((out_idx >> 2) + 1, h), v_dst_1);
    write_imagef(out_image, (int2)((out_idx >> 2) + 2, h), v_dst_2);
    write_imagef(out_image, (int2)((out_idx >> 2) + 3, h), v_dst_3);
}
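The indexing above implies the launch geometry: each work-item produces a 4-pixel-by-4-channel output tile. A minimal sketch, assuming queue and kernel handles already exist and out_width/out_channel/out_height hold the output shape:

size_t gws[2] = {
    (size_t)(((out_width + 3) / 4) * ((out_channel + 3) / 4)), /* dim0: wc */
    (size_t)out_height                                         /* dim1: h  */
};
cl_int err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, gws, NULL, 0, NULL, NULL);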
The image dimensions used above:
input_image.height = input_height
input_image.width = (input_channel + 3) / 4 * 4 * ((input_width + 3) / 4 * 4), counted in floats
out_image has the same structure as input_image
filter.height = (out_channel + 3) / 4
filter.width = input_channel * 4
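Since a CL_RGBA/CL_FLOAT image packs 4 floats per texel, the actual image width is a quarter of the float count above. A creation sketch (needs <CL/cl.h> and <string.h>; context is assumed, error handling elided; dst_input is the converted buffer produced below):

cl_image_format fmt = { CL_RGBA, CL_FLOAT };
cl_image_desc desc;
memset(&desc, 0, sizeof(desc));
desc.image_type   = CL_MEM_OBJECT_TYPE_IMAGE2D;
desc.image_width  = (size_t)((input_channel + 3) / 4 * ((input_width + 3) / 4 * 4)); /* texels */
desc.image_height = (size_t)input_height;
cl_int err;
cl_mem input_image = clCreateImage(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   &fmt, &desc, dst_input, &err);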
Input conversion, NCHW -> target format:
INT32 dst_row = (src_channel + 3) / 4 * 4;        // channels aligned up to 4
INT32 src_width_align = (src_width + 3) / 4 * 4;  // width aligned up to 4
float *dst_input = (float *)calloc(src_height * src_width_align * dst_row, sizeof(float));
for (int h = 0; h < src_height; h++)
{
    for (int w = 0; w < src_width; w++)
    {
        for (int c = 0; c < src_channel; c++)
        {
            // channel group first, then pixel, then channel within the group
            int dst_idx = (c / 4) * src_width_align * 4 + w * 4 + c % 4;
            dst_input[h * src_width_align * dst_row + dst_idx] =
                src_input[c * src_height * src_width + h * src_width + w];
        }
    }
}
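A quick cross-check (my own sketch, needs <assert.h>) that this conversion and the tnn_offset helper from earlier agree on where an element lands:

int c = 5, h = 2, w = 3; /* arbitrary, requires src_channel > 5, src_height > 2, src_width > 3 */
assert(h * src_width_align * dst_row + (c / 4) * src_width_align * 4 + w * 4 + c % 4
       == tnn_offset(c, h, w, src_channel, src_width));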
Filter -> target format:
dst_row = (out_channel + 3) / 4;  // number of 4-output-channel groups
float *dst_weight = (float *)calloc(src_channel * 4 * dst_row, sizeof(float));
for (int c_out = 0; c_out < out_channel; c_out++)
{
    for (int c_in = 0; c_in < src_channel; c_in++)
    {
        // one row per group of 4 output channels; input-channel major within a row
        dst_weight[c_out / 4 * src_channel * 4 + c_in * 4 + c_out % 4] =
            src_weight[c_out * src_channel + c_in];
    }
}
Output, target format back to NCHW:
dst_row = (out_channel + 3) / 4 * 4;
src_width_align = (out_width + 3) / 4 * 4;
float *dst_output = (float *)calloc(out_height * src_width_align * dst_row, sizeof(float));
// dst_output is filled by reading the kernel result back from the device
for (int h = 0; h < out_height; h++)
{
    for (int w = 0; w < out_width; w++)
    {
        for (int c = 0; c < out_channel; c++)
        {
            int dst_idx = (c / 4) * src_width_align * 4 + w * 4 + c % 4;
            output[c * out_height * out_width + h * out_width + w]
                = dst_output[h * src_width_align * dst_row + dst_idx];
        }
    }
}
With TNN's arrangement you can see that, inside the kernel's for loop, input_image advances by a large stride on every iteration (a whole aligned width row per channel group), which does not feel very cache-friendly. So I rearranged the feature map to see:
w0 | w1 | w2 | w3 | w0 | w1 | w2 | w3 | ... |
c0 c1 c2 c3 | c0 c1 c2 c3 | c0 c1 c2 c3 | c0 c1 c2 c3 | c4 c5 c6 c7 | c4 c5 c6 c7 | c4 c5 c6 c7 | c4 c5 c6 c7 | ... |
Arranged this way, each thread's successive for-loop iterations should be friendlier to the cache, since all of a thread's input-channel groups sit contiguously.
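The float offset of (c, h, w) in this rearranged layout, as a sketch mirroring tnn_offset above (again my own helper name):

static int rearranged_offset(int c, int h, int w, int channels, int width)
{
    int w_align = (width + 3) / 4 * 4;
    int c_align = (channels + 3) / 4 * 4;
    return h * w_align * c_align   /* row                                 */
         + (w / 4) * c_align * 4   /* width group                         */
         + (c / 4) * 16            /* channel group within the width group */
         + (w % 4) * 4             /* pixel within the group              */
         + (c % 4);                /* channel within the group            */
}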
The corresponding kernel:
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;

__kernel void kernel_name(__read_only image2d_t input_image,
                          const int src_channel,
                          __write_only image2d_t output_image,
                          const int dst_width,
                          const int dst_channel,
                          __global float *weight,
                          const int weight_width,
                          __global float *bias)
{
    const int wc = get_global_id(0);
    const int h = get_global_id(1);
    const int input_c_align = (src_channel + 3) / 4 * 4;
    const int output_c_align = (dst_channel + 3) / 4 * 4;
    int output_w_align = (dst_width + 3) >> 2;  // number of 4-pixel width groups
    const int c = wc / output_w_align;          // output-channel group index
    const int w = (wc % output_w_align);        // width-group index
    output_w_align = output_w_align << 2;
    const int out_c = c << 2;
    const int out_w = w << 2;
    float4 v_bias = vload4(0, bias + out_c);
    float4 v_dst_0 = v_bias;
    float4 v_dst_1 = v_dst_0;
    float4 v_dst_2 = v_dst_0;
    float4 v_dst_3 = v_dst_0;
    int src_c_aligned = (src_channel + 3) >> 2;
    __global float *weight_tmp = weight + c * weight_width; // this output-channel group's row
    int idx = w * input_c_align; // texel x of this width group's first channel group
    for (int i = 0; i < src_c_aligned; ++i)
    {
        // 4x4 weight tile for input channels 4i..4i+3, contiguous in the buffer
        float4 v_weight0 = vload4(0, weight_tmp + i * 4 * 4);
        float4 v_weight1 = vload4(0, weight_tmp + i * 4 * 4 + 4);
        float4 v_weight2 = vload4(0, weight_tmp + i * 4 * 4 + 8);
        float4 v_weight3 = vload4(0, weight_tmp + i * 4 * 4 + 12);
        // 4 consecutive pixels of channel group i: 4 adjacent texels
        float4 v_src0 = read_imagef(input_image, sampler, (int2)(idx, h));
        float4 v_src1 = read_imagef(input_image, sampler, (int2)(idx + 1, h));
        float4 v_src2 = read_imagef(input_image, sampler, (int2)(idx + 2, h));
        float4 v_src3 = read_imagef(input_image, sampler, (int2)(idx + 3, h));
        v_dst_0 = mad(v_weight0, v_src0.s0, v_dst_0);
        v_dst_0 = mad(v_weight1, v_src0.s1, v_dst_0);
        v_dst_0 = mad(v_weight2, v_src0.s2, v_dst_0);
        v_dst_0 = mad(v_weight3, v_src0.s3, v_dst_0);
        v_dst_1 = mad(v_weight0, v_src1.s0, v_dst_1);
        v_dst_1 = mad(v_weight1, v_src1.s1, v_dst_1);
        v_dst_1 = mad(v_weight2, v_src1.s2, v_dst_1);
        v_dst_1 = mad(v_weight3, v_src1.s3, v_dst_1);
        v_dst_2 = mad(v_weight0, v_src2.s0, v_dst_2);
        v_dst_2 = mad(v_weight1, v_src2.s1, v_dst_2);
        v_dst_2 = mad(v_weight2, v_src2.s2, v_dst_2);
        v_dst_2 = mad(v_weight3, v_src2.s3, v_dst_2);
        v_dst_3 = mad(v_weight0, v_src3.s0, v_dst_3);
        v_dst_3 = mad(v_weight1, v_src3.s1, v_dst_3);
        v_dst_3 = mad(v_weight2, v_src3.s2, v_dst_3);
        v_dst_3 = mad(v_weight3, v_src3.s3, v_dst_3);
        idx += 4; // next channel group of the same 4 pixels is adjacent
    }
    int out_idx = w * output_c_align + c * 4; // texel x in the output image
    write_imagef(output_image, (int2)(out_idx, h), v_dst_0);
    write_imagef(output_image, (int2)(out_idx + 1, h), v_dst_1);
    write_imagef(output_image, (int2)(out_idx + 2, h), v_dst_2);
    write_imagef(output_image, (int2)(out_idx + 3, h), v_dst_3);
}
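On the host, the arguments would be bound roughly as follows (a sketch: kernel, weight_buf, and bias_buf are assumed handles of my own naming; weight_width follows the filter conversion shown earlier, i.e. src_channel * 4 floats per group of 4 output channels, which is safest when src_channel is a multiple of 4):

int weight_width = src_channel * 4; /* floats per output-channel-group row */
cl_int err = CL_SUCCESS;
err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
err |= clSetKernelArg(kernel, 1, sizeof(int), &src_channel);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image);
err |= clSetKernelArg(kernel, 3, sizeof(int), &out_width);
err |= clSetKernelArg(kernel, 4, sizeof(int), &out_channel);
err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &weight_buf);
err |= clSetKernelArg(kernel, 6, sizeof(int), &weight_width);
err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &bias_buf);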
Input conversion (NCHW -> the new format):
INT32 dst_row = (src_channel + 3) / 4 * 4;
INT32 src_width_align = (src_width + 3) / 4 * 4;
float *dst_input = (float *)calloc(src_height * src_width_align * dst_row, sizeof(float));
for (int h = 0; h < src_height; h++)
{
    for (int w = 0; w < src_width; w++)
    {
        for (int c = 0; c < src_channel; c++)
        {
            // width group first, then channel group, then pixel, then channel
            int dst_idx = (w / 4) * dst_row * 4 + c / 4 * 16 + (w % 4) * 4 + c % 4;
            dst_input[h * src_width_align * dst_row + dst_idx] =
                src_input[c * src_height * src_width + h * src_width + w];
        }
    }
}
Performance analysis of the two kernels:
1. When the input width is small, the two kernels perform roughly the same. This suggests the in-loop contiguity of input_image buys almost nothing; the cache may simply be small, so right after the weights are cached, input_image has to be cached on top of them.
2. When the input width is large, the second kernel is clearly slower. Under parallel execution the data accessed by different threads is poorly contiguous: while the first thread handles pixels w0/w1/w2/w3 and the second handles w4/w5/w6/w7, their addresses are far apart, which prevents coalesced access (made concrete in the sketch below).
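The point in item 2 can be made concrete with a back-of-envelope computation (my own sketch, not profiler data): the float4-texel x coordinate of the first input load of four adjacent work-items in each layout.

#include <stdio.h>

int main(void)
{
    int channels = 64;                  /* example channel count */
    int c_align = (channels + 3) / 4 * 4;
    for (int t = 0; t < 4; t++)         /* four adjacent work-items, same channel group */
    {
        int tnn_x = 4 * t;              /* layout 1: idx >> 2 = i * w_align + 4t, with i = 0 */
        int new_x = t * c_align;        /* layout 2: idx = w_group * c_align */
        printf("thread %d: layout1 texel %d, layout2 texel %d\n", t, tnn_x, new_x);
    }
    /* Layout 1: threads 0..3 read texels 0..15, one contiguous run -> coalesces well.
       Layout 2: starting texels are c_align (here 64) apart -> separate transactions. */
    return 0;
}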
In other words, when arranging data, the ideal is that a single thread makes full use of a cache line while the data accessed by neighboring threads stays contiguous.