最近接到一个任务,需要提升RGB转NV12的C++代码的性能,经过几天的反复调试,把一张1080P的RGB图片的转换时间从8ms缩短到了4ms。使用了下面几个技巧:
1、把for循环拆分为2个循环,分别计算亮度(Y)与色差值(UV),方便循环展开。
2、定义Calc_y()与Calc_uv() inline函数,把亮度(Y)与色差值(UV)的计算进行循环展开。循环展开能优化性能看似反直觉,其实它减少了循环判断与跳转的次数,并且利用了CPU的指令流水线:平铺的代码更适合指令流水线。循环展开后转换时间减少了2ms。
3、使用指针直接访问数据代替循环体内的memcpy,转换时间再次减少了2ms。
// 优化后的代码
/*
 * Compute the luma (Y) byte for one pixel.
 *
 * inRgb  - base of the pixel array
 * offset - index of the pixel inside inRgb
 * out    - destination for the single Y byte
 *
 * The value is the sum of three table lookups, truncated to 8 bits.
 * `static` gives the helper internal linkage so that a call the compiler
 * chooses not to inline still has a definition to link against.
 *
 * NOTE(review): the `b` field indexes Y_R and the `r` field indexes Y_B,
 * which suggests the bytes in memory are ordered B,G,R,A - confirm.
 */
static inline void Calc_y(const RGB* inRgb,int offset, uint8_t* out)
{
    const RGB* px = inRgb + offset;

    *out = (uint8_t)(Y_R[px->b] + Y_G[px->g] + Y_B[px->r]);
}
/*
 * Compute the interleaved chroma pair for one pixel.
 *
 * inRgb  - base of the pixel array
 * offset - index of the pixel inside inRgb
 * out    - destination for two bytes: out[0] = U, out[1] = V
 *
 * Each component is a table-lookup difference biased by 128 and truncated
 * to 8 bits. U_B is reused as the leading term of V as well.
 * `static` gives the helper internal linkage so that a call the compiler
 * chooses not to inline still has a definition to link against.
 *
 * NOTE(review): the `r` field feeds the *_B tables and the `b` field the
 * *_R tables, which suggests the bytes in memory are ordered B,G,R,A -
 * confirm.
 */
static inline void Calc_uv(const RGB* inRgb, int offset, uint8_t* out)
{
    const RGB* px = inRgb + offset;

    out[0] = (uint8_t)(U_B[px->r] - U_R[px->b] - U_G[px->g] + 128); /* U */
    out[1] = (uint8_t)(U_B[px->b] - V_G[px->g] - V_B[px->r] + 128); /* V */
}
/*
 * Convert an image of 4-byte RGB pixels into a luma plane followed by an
 * interleaved U/V plane.
 *
 * rgbBufIn  - nWidth * nHeight pixels, read as an array of RGB structs
 * yuvBufOut - destination; Y bytes start at index 0, U/V pairs start at
 *             index nWidth * nHeight
 * nWidth    - pixels per row
 * nHeight   - number of rows
 *
 * Rows are visited from the last one to the first, so the output is
 * vertically flipped relative to the input. Luma and chroma are produced
 * in two separate passes so that each inner loop can be unrolled by hand.
 * Chroma is sampled from every second row (starting at the last row) and
 * from the even-indexed columns of that row.
 *
 * Null buffers or non-positive dimensions make the call a no-op.
 *
 * NOTE(review): with an odd nWidth the tail loop still emits a U/V pair
 * for the last column, i.e. nWidth + 1 chroma bytes per sampled row; with
 * an odd nHeight (nHeight + 1) / 2 chroma rows are written. The caller's
 * buffer must be sized for that - confirm against the allocation site.
 */
void RGB2YUV_NV12(uint8_t* rgbBufIn, uint8_t* yuvBufOut, int nWidth, int nHeight)
{
    /* Must match the number of unrolled Calc_y calls below (and twice the
     * number of unrolled Calc_uv calls), so it is a compile-time constant. */
    enum { BATCH_SIZE = 10 };

    if (!rgbBufIn || !yuvBufOut || nWidth <= 0 || nHeight <= 0) {
        return;
    }

    const RGB* inRgb = (const RGB*)rgbBufIn;
    const int rem = nWidth % BATCH_SIZE;   /* pixels left after the unrolled part */
    const int batch = nWidth - rem;        /* pixels covered by the unrolled part */
    int yIdx = 0;                          /* write cursor in the Y plane */
    int uvIdx = nWidth * nHeight;          /* write cursor in the U/V plane */

    /* Pass 1: one Y byte per pixel. */
    for (int y = nHeight - 1; y >= 0; --y) {
        const int rowStart = y * nWidth;

        for (int x = 0; x < batch; x += BATCH_SIZE) {
            const int offset = rowStart + x;

            Calc_y(inRgb, offset,     &yuvBufOut[yIdx]);
            Calc_y(inRgb, offset + 1, &yuvBufOut[yIdx + 1]);
            Calc_y(inRgb, offset + 2, &yuvBufOut[yIdx + 2]);
            Calc_y(inRgb, offset + 3, &yuvBufOut[yIdx + 3]);
            Calc_y(inRgb, offset + 4, &yuvBufOut[yIdx + 4]);
            Calc_y(inRgb, offset + 5, &yuvBufOut[yIdx + 5]);
            Calc_y(inRgb, offset + 6, &yuvBufOut[yIdx + 6]);
            Calc_y(inRgb, offset + 7, &yuvBufOut[yIdx + 7]);
            Calc_y(inRgb, offset + 8, &yuvBufOut[yIdx + 8]);
            Calc_y(inRgb, offset + 9, &yuvBufOut[yIdx + 9]);
            yIdx += BATCH_SIZE;
        }

        /* Tail of the row that does not fill a whole batch. */
        for (int i = 0; i < rem; i++) {
            Calc_y(inRgb, rowStart + batch + i, &yuvBufOut[yIdx]);
            yIdx++;
        }
    }

    /* Pass 2: one U/V pair per two columns, on every second row. */
    for (int y = nHeight - 1; y >= 0; y -= 2) {
        const int rowStart = y * nWidth;

        for (int x = 0; x < batch; x += BATCH_SIZE) {
            const int offset = rowStart + x;

            Calc_uv(inRgb, offset,     &yuvBufOut[uvIdx]);
            Calc_uv(inRgb, offset + 2, &yuvBufOut[uvIdx + 2]);
            Calc_uv(inRgb, offset + 4, &yuvBufOut[uvIdx + 4]);
            Calc_uv(inRgb, offset + 6, &yuvBufOut[uvIdx + 6]);
            Calc_uv(inRgb, offset + 8, &yuvBufOut[uvIdx + 8]);
            uvIdx += BATCH_SIZE;
        }

        /* Tail of the row: every second remaining column. */
        for (int i = 0; i < rem; i += 2) {
            Calc_uv(inRgb, rowStart + batch + i, &yuvBufOut[uvIdx]);
            uvIdx += 2;
        }
    }
}
附上优化前的代码,供对比。
//优化前的代码
/*
 * Baseline (pre-optimization) conversion of 4-byte RGB pixels into a Y
 * plane followed by interleaved U/V bytes.
 *
 * rgbBufIn  - nWidth * nHeight pixels, read as an array of RGB structs
 * yuvBufOut - destination; Y bytes start at index 0, U/V bytes start at
 *             index nWidth * nHeight
 * nWidth    - pixels per row
 * nHeight   - number of rows
 *
 * Rows are visited from the last one to the first. Luma and chroma are
 * produced in the same loop; a U/V pair is emitted only for pixels whose
 * column and row indices are both odd.
 */
void RGB2YUV_NV12(uint8_t* rgbBufIn, uint8_t* yuvBufOut, int nWidth, int nHeight)
{
int pix = 0; // write cursor in the Y plane
int pixP4 = nWidth * nHeight; // write cursor in the U/V area, right after the Y plane
const RGB* inRgb = (RGB*)rgbBufIn;
int x, y, val, rgb_offset;
RGB rgbByte; // local copy of the pixel being converted
//MPRINTF("size: %d, %p\n", sizeof(struct RGB), rgbBufIn);
for (y = nHeight - 1; y >= 0 ; --y) // rows, last to first
{
rgb_offset = y * nWidth; // index of the first pixel of row y
for (x = 0; x < nWidth; ++x) // pixels within the row
{
// Direct struct assignment alternative, disabled in favour of memcpy:
//rgbByte = inRgb[rgb_offset + x];
memcpy(&rgbByte, &inRgb[rgb_offset + x], sizeof(rgbByte));
yuvBufOut[pix] = Y_R[rgbByte.b] + Y_G[rgbByte.g] + Y_B[rgbByte.r];// Y, truncated to 8 bits on store
if (x & y & 1) // true only when both x and y are odd
{
// U
val = U_B[rgbByte.r] - U_R[rgbByte.b] - U_G[rgbByte.g] + 128;
yuvBufOut[pixP4++] = val;
// V (U_B is reused as the leading term)
val = U_B[rgbByte.b] - V_G[rgbByte.g] - V_B[rgbByte.r] + 128;
yuvBufOut[pixP4++] = val;
}
++pix;
}
}
}
附录:
1、评论提到的RGB结构体定义如下:
/*
 * One 32-bit pixel: four consecutive single-byte fields.
 *
 * NOTE(review): the conversion helpers in this file look up the *_B tables
 * with `r` and the *_R tables with `b`, so the bytes in memory may really be
 * ordered B,G,R,A - confirm against the image source.
 */
typedef struct RGB_ {
    unsigned char r, g, b; /* colour channels, one byte each */
    unsigned char a;       /* fourth byte of the rgba32 pixel */
} RGB;
2、评论中提到Y_R[], Y_G[], Y_B[] 等数组是查表法转换YUV用到的数据结构,定义如下:
#define COLORSIZE 256

/*
 * Lookup tables, one entry per 8-bit channel value. Each entry holds the
 * channel value multiplied by a fixed coefficient, so a conversion becomes
 * a handful of table reads plus additions.
 */
unsigned short Y_R[COLORSIZE], Y_G[COLORSIZE], Y_B[COLORSIZE];
unsigned short U_R[COLORSIZE], U_G[COLORSIZE], U_B[COLORSIZE];
unsigned short V_G[COLORSIZE], V_B[COLORSIZE];

/* Multiply `level` by the fraction weight/4096, rounding down. */
static int scale_q12(int level, int weight)
{
    return (level * weight) >> 12;
}

/*
 * Fill every lookup table. Must run once before any conversion reads the
 * tables. The trailing comments give weight/4096 for each coefficient.
 */
void table_init()
{
    for (int level = 0; level < COLORSIZE; ++level) {
        Y_R[level] = scale_q12(level, 1224); /* ~0.2988 */
        Y_G[level] = scale_q12(level, 2404); /* ~0.5869 */
        Y_B[level] = scale_q12(level, 469);  /* ~0.1145 */

        U_R[level] = scale_q12(level, 692);  /* ~0.1689 */
        U_G[level] = scale_q12(level, 1356); /* ~0.3311 */
        U_B[level] = level >> 1;             /* 0.5, same as weight 2048 */

        /* No V_R table: its weight would be 2048 as well, i.e. identical
         * to U_B. */
        /* NOTE(review): 1731/4096 is ~0.4226, whereas the annotation this
         * line originally carried said 0.4184 - confirm which is intended. */
        V_G[level] = scale_q12(level, 1731); /* ~0.4226 */
        V_B[level] = scale_q12(level, 334);  /* ~0.0815 */
    }
}