如何把RGB转NV12的C++代码性能提高1倍

彭朝劲

已于 2023-09-20 11:51:22 修改

阅读量1.2k

点赞数 4

分类专栏：VS工程　文章标签：c++、算法、开发语言

于 2023-03-07 20:33:02 首次发布

本文链接：https://blog.csdn.net/u013705518/article/details/129390545

版权

VS工程专栏收录该内容

13 篇文章 1 订阅

订阅专栏

最近接到一个任务，需要提升RGB转NV12的C++代码的性能。经过几天的反复调试，把一张1080P的RGB图片的转换时间从8ms缩短到了4ms。使用了下面几个技巧：

1、把for循环拆分为2个循环，分别计算亮度（Y）与色差值（UV），方便循环展开。

2、定义Calc_y()与Calc_uv() inline函数，把亮度（Y）与色差值（UV）计算进行循环展开。循环展开优化性能是反直觉的，循环展开是利用CPU的指令流水线进行优化，平铺的代码更适合指令流水线。循环展开后转换时间减少了2ms。

3、使用指针直接访问数据代替循环体内的memcpy，转换时间再次减少了2ms。

// 优化后的代码

/*
 * Compute the luma (Y) byte of a single pixel.
 *
 * inRgb  - base of the pixel array
 * offset - index of the pixel inside inRgb
 * out    - receives the one Y byte
 *
 * The result is the sum of one lookup in each of the Y_R, Y_G and Y_B tables.
 * Y_R is indexed with the `b` field and Y_B with the `r` field.
 * NOTE(review): that pairing presumably means the source bytes are ordered
 * B,G,R,A -- confirm against whatever produces the pixel buffer.
 *
 * Declared `static inline`: a plain file-scope `inline` definition provides
 * no external definition in C99/C11, so a call the compiler chooses not to
 * inline would fail to link.
 */
static inline void Calc_y(const RGB* inRgb, int offset, uint8_t* out)
{
	const RGB* px = &inRgb[offset];

	/* Table entries are unsigned short; the sum is narrowed to one byte. */
	*out = (uint8_t)(Y_R[px->b] + Y_G[px->g] + Y_B[px->r]);
}

/*
 * Compute the chroma (U, V) byte pair of a single pixel.
 *
 * inRgb  - base of the pixel array
 * offset - index of the pixel inside inRgb
 * out    - receives two bytes: out[0] = U, out[1] = V
 *
 * Both components are built from table lookups and biased by 128.
 * U_B (the i >> 1 table) supplies the positive term of both U and V; there
 * is no separate V_R table.
 * As in Calc_y, the red-weight tables are indexed with the `b` field and the
 * blue-weight tables with the `r` field.
 * NOTE(review): presumably the source bytes are ordered B,G,R,A -- confirm.
 *
 * Declared `static inline`: a plain file-scope `inline` definition provides
 * no external definition in C99/C11, so a call the compiler chooses not to
 * inline would fail to link.
 */
static inline void Calc_uv(const RGB* inRgb, int offset, uint8_t* out)
{
	const RGB* px = &inRgb[offset];

	out[0] = (uint8_t)(U_B[px->r] - U_R[px->b] - U_G[px->g] + 128); /* U */
	out[1] = (uint8_t)(U_B[px->b] - V_G[px->g] - V_B[px->r] + 128); /* V */
}

/*
 * Convert an image of 4-byte RGB structs to NV12 (optimized version).
 *
 * rgbBufIn  - source pixels: nWidth * nHeight RGB structs, row after row
 * yuvBufOut - destination: nWidth * nHeight luma bytes, followed by
 *             2 * (nWidth / 2) * (nHeight / 2) interleaved U,V bytes
 * nWidth    - image width in pixels
 * nHeight   - image height in pixels
 *
 * Source rows are consumed from the last row up to row 0, so the output is
 * vertically flipped relative to the input buffer.
 *
 * Chroma is not averaged: each 2x2 group takes its U,V pair from the pixel
 * in the even column of the odd row.
 *
 * The work is split into a luma pass and a chroma pass, each unrolled by
 * batchSize pixels with a scalar loop for the leftover columns.
 *
 * For odd dimensions the unpaired last column and the unpaired row 0 produce
 * no chroma, so the chroma plane never exceeds the size given above.
 * The call is a no-op for NULL buffers or non-positive dimensions.
 */
void RGB2YUV_NV12(uint8_t* rgbBufIn, uint8_t* yuvBufOut, int nWidth, int nHeight)
{
	/* Nothing to convert; this also keeps all index arithmetic non-negative. */
	if (!rgbBufIn || !yuvBufOut || nWidth <= 0 || nHeight <= 0)
	{
		return;
	}

	const RGB* inRgb = (const RGB*)rgbBufIn;
	const int batchSize = 10;           /* pixels per unrolled iteration; must stay even */
	const int rem = nWidth % batchSize; /* columns left over after the unrolled part */
	const int batch = nWidth - rem;     /* columns covered by the unrolled part */
	int pix = 0;                        /* write cursor in the Y plane */
	int pixP4 = nWidth * nHeight;       /* write cursor in the UV plane */
	int x, y, rgb_offset;

	/* Pass 1: one luma byte per pixel, rows taken bottom-up. */
	for (y = nHeight - 1; y >= 0; --y)
	{
		rgb_offset = y * nWidth;
		for (x = 0; x < batch; x += batchSize)
		{
			int offset = rgb_offset + x;
			Calc_y(inRgb, offset, &yuvBufOut[pix]);
			Calc_y(inRgb, offset + 1, &yuvBufOut[pix + 1]);
			Calc_y(inRgb, offset + 2, &yuvBufOut[pix + 2]);
			Calc_y(inRgb, offset + 3, &yuvBufOut[pix + 3]);
			Calc_y(inRgb, offset + 4, &yuvBufOut[pix + 4]);
			Calc_y(inRgb, offset + 5, &yuvBufOut[pix + 5]);
			Calc_y(inRgb, offset + 6, &yuvBufOut[pix + 6]);
			Calc_y(inRgb, offset + 7, &yuvBufOut[pix + 7]);
			Calc_y(inRgb, offset + 8, &yuvBufOut[pix + 8]);
			Calc_y(inRgb, offset + 9, &yuvBufOut[pix + 9]);
			pix += batchSize;
		}

		/* Leftover columns of this row. */
		for (x = batch; x < nWidth; ++x)
		{
			Calc_y(inRgb, rgb_offset + x, &yuvBufOut[pix]);
			pix++;
		}
	}

	/*
	 * Pass 2: one U,V pair per 2x2 group, taken from the odd rows only.
	 * (nHeight & ~1) - 1 is the highest odd row index; for an even height
	 * that is nHeight - 1, for an odd height the unpaired top row is skipped.
	 */
	for (y = (nHeight & ~1) - 1; y >= 1; y -= 2)
	{
		rgb_offset = y * nWidth;
		for (x = 0; x < batch; x += batchSize)
		{
			int offset = rgb_offset + x;
			Calc_uv(inRgb, offset, &yuvBufOut[pixP4]);
			Calc_uv(inRgb, offset + 2, &yuvBufOut[pixP4 + 2]);
			Calc_uv(inRgb, offset + 4, &yuvBufOut[pixP4 + 4]);
			Calc_uv(inRgb, offset + 6, &yuvBufOut[pixP4 + 6]);
			Calc_uv(inRgb, offset + 8, &yuvBufOut[pixP4 + 8]);
			pixP4 += batchSize;
		}

		/*
		 * Leftover columns: only complete column pairs get a sample, so an
		 * odd width does not emit an extra pair past the end of the plane.
		 */
		for (x = batch; x + 1 < nWidth; x += 2)
		{
			Calc_uv(inRgb, rgb_offset + x, &yuvBufOut[pixP4]);
			pixP4 += 2;
		}
	}
}

附上优化前的代码，供对比。

//优化前的代码

void RGB2YUV_NV12(uint8_t* rgbBufIn, uint8_t* yuvBufOut, int nWidth, int nHeight)
{
	int pix = 0;
	int pixP4 = nWidth * nHeight;

	const RGB* inRgb = (RGB*)rgbBufIn;
	int x, y, val, rgb_offset;
	RGB rgbByte;

	//MPRINTF("size: %d, %p\n", sizeof(struct RGB), rgbBufIn);

	for (y = nHeight - 1; y >= 0 ; --y) //line
	{
		rgb_offset = y * nWidth;
		for (x = 0; x < nWidth; ++x) //pixf
		{
			//rgbByte = inRgb[rgb_offset + x];
			memcpy(&rgbByte, &inRgb[rgb_offset + x], sizeof(rgbByte));
			yuvBufOut[pix] = Y_R[rgbByte.b] + Y_G[rgbByte.g] + Y_B[rgbByte.r];//Y

			if (x & y & 1)
			{
				//U
				val = U_B[rgbByte.r] - U_R[rgbByte.b] - U_G[rgbByte.g] + 128;
				yuvBufOut[pixP4++] = val;
				//V
				val = U_B[rgbByte.b] - V_G[rgbByte.g] - V_B[rgbByte.r] + 128;
				yuvBufOut[pixP4++] = val;
			}
			++pix;
		}
	}
}

附录：

1、评论提到的RGB结构体定义如下：

/*
 * One 32-bit pixel: four consecutive single-byte channels.
 *
 * NOTE(review): the conversion routines in this file apply the red-weight
 * tables to `b` and the blue-weight tables to `r`, which suggests the bytes
 * in the actual buffer arrive as B,G,R,A -- confirm before relying on the
 * field names.
 */
struct RGB_ {
	unsigned char r;	/* byte 0 */
	unsigned char g;	/* byte 1 */
	unsigned char b;	/* byte 2 */
	unsigned char a;	/* byte 3; makes the pixel 32 bits wide (rgba32) */
};

typedef struct RGB_ RGB;

2、评论中提到Y_R[], Y_G[], Y_B[] 等数组是查表法转换YUV用到的数据结构，定义如下：

#define COLORSIZE 256

/*
 * Per-channel lookup tables for the table-driven RGB -> YUV conversion.
 * Each entry holds (channel value * weight) in 12-bit fixed point, already
 * shifted back down, so a conversion is a few lookups plus additions.
 * All tables are filled by table_init(), which must run before first use.
 */
unsigned short Y_R[COLORSIZE], Y_G[COLORSIZE], Y_B[COLORSIZE];
unsigned short U_R[COLORSIZE], U_G[COLORSIZE], U_B[COLORSIZE];
unsigned short V_G[COLORSIZE], V_B[COLORSIZE];

/*
 * Fill the conversion tables for every 8-bit channel value.
 *
 * Weights are expressed as multiplier / 4096 (hence the >> 12); results are
 * truncated, not rounded.
 *
 * NOTE(review): earlier annotations gave 0.1162 for the Y_B weight and
 * 0.4184 for the V_G weight, which do not match the multipliers 469 and
 * 1731 actually used. The multipliers are kept as they are -- confirm which
 * side was intended.
 */
void table_init(void)
{
	int i;

	for (i = 0; i < COLORSIZE; i++)
	{
		Y_R[i] = (i * 1224) >> 12;	/* 1224/4096 ~ 0.2988 */
		Y_G[i] = (i * 2404) >> 12;	/* 2404/4096 ~ 0.5869 */
		Y_B[i] = (i * 469) >> 12;	/*  469/4096 ~ 0.1145 */
		U_R[i] = (i * 692) >> 12;	/*  692/4096 ~ 0.1689 */
		U_G[i] = (i * 1356) >> 12;	/* 1356/4096 ~ 0.3311 */
		U_B[i] = i >> 1;		/* 0.5, same as (i * 2048) >> 12 */
		/* No V_R table: its weight is also 0.5, so U_B is used for it. */
		V_G[i] = (i * 1731) >> 12;	/* 1731/4096 ~ 0.4226 */
		V_B[i] = (i * 334) >> 12;	/*  334/4096 ~ 0.0815 */
	}
}