第一次写博文,不好意思,写的应该不好,另外我只是C++的业余程序员,C++功底很一般,见谅!
我在做视频识别的工程中要用到YUV转RGB的功能。以前我用过MMX指令的代码,那是网上找的代码,我当时并不懂MMX,也不懂汇编,只是知道MMX比普通代码要快,确实很快。现在知道SSE2比MMX要快一倍,AVX2比SSE2要快一倍,所以想尝试用AVX2来实现YUV转RGB的功能。在网上寻找多次,也没找到AVX2的现成代码,只找到libyuv库中有用AVX2来实现,但测试发现它的性能没有比MMX快4倍,只快一倍多一点。分析发现里面还用了SSSE3指令(见下面的ARGBToRGB24Row_SSSE3函数):
// Repacks a row of 4-byte ARGB pixels into 3-byte RGB pixels, handling
// 16 pixels (64 input bytes -> 48 output bytes) per loop iteration.
//
// src_argb: source row; read 64 bytes per iteration with unaligned loads.
// dst_rgb:  destination row; written 48 bytes per iteration with unaligned
//           stores.
// width:    pixel count; reduced by 16 per iteration.
//
// Which 3 of every 4 bytes are kept is decided by kShuffleMaskARGBToRGB24,
// which is defined elsewhere.
//
// The function is naked: there is no compiler-generated prologue/epilogue, so
// the arguments are read directly off the stack relative to esp and the
// function ends with a bare ret.
//
// NOTE(review): the loop body always runs at least once and always processes
// a full 16 pixels, so a width that is not a positive multiple of 16 makes it
// read and write past the nominal end of the row. Presumably callers
// guarantee a suitable width or padded buffers -- confirm.
__declspec(naked)
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) {
__asm {
// No frame is set up, so [esp] is the return address and the three
// arguments follow it.
mov eax, [esp + 4] // eax = src_argb (advanced by 64 each iteration)
mov edx, [esp + 8] // edx = dst_rgb (advanced by 48 each iteration)
mov ecx, [esp + 12] // ecx = width (pixels remaining)
// The shuffle mask is loaded once, outside the loop. movdqa is an aligned
// load, so the mask constant must be 16-byte aligned.
movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24
convertloop :
// Load 4 x 16 bytes = 16 ARGB pixels (4 pixels per register).
movdqu xmm0, [eax] // pixels 0..3
movdqu xmm1, [eax + 16] // pixels 4..7
movdqu xmm2, [eax + 32] // pixels 8..11
movdqu xmm3, [eax + 48] // pixels 12..15
lea eax, [eax + 64] // advance source; lea leaves the flags untouched
// Compact each register from 16 ARGB bytes to 12 RGB bytes in its low part.
// NOTE(review): the por merges below are only clean if the mask also zeroes
// the top 4 bytes of each result -- verify against the mask definition.
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
pshufb xmm2, xmm6
pshufb xmm3, xmm6
// Repack the four 12-byte groups into three full 16-byte outputs:
//   out0 = xmm0[0..11] | (xmm1[0..3]  shifted up 12 bytes)
//   out1 = xmm1[4..11] | (xmm2[0..7]  shifted up 8 bytes)
//   out2 = xmm2[8..11] | (xmm3[0..11] shifted up 4 bytes)
// The instructions for the three outputs are interleaved.
movdqa xmm4, xmm1 // copy of group 1; its first 4 bytes complete out0
psrldq xmm1, 4 // drop those 4 bytes: remaining 8 bytes of group 1 now at bottom
pslldq xmm4, 12 // move first 4 bytes of group 1 to the top 4 bytes
movdqa xmm5, xmm2 // copy of group 2; its first 8 bytes complete out1
por xmm0, xmm4 // out0 = group 0 (12 bytes) + first 4 bytes of group 1
pslldq xmm5, 8 // move first 8 bytes of group 2 to the top 8 bytes
movdqu[edx], xmm0 // store out0
por xmm1, xmm5 // out1 = last 8 bytes of group 1 + first 8 bytes of group 2
psrldq xmm2, 8 // keep only the last 4 bytes of group 2, at the bottom
pslldq xmm3, 4 // move the 12 bytes of group 3 to the top 12 bytes
por xmm2, xmm3 // out2 = last 4 bytes of group 2 + all 12 bytes of group 3
movdqu[edx + 16], xmm1 // store out1
movdqu[edx + 32], xmm2 // store out2
lea edx, [edx + 48] // advance destination by 16 pixels * 3 bytes
sub ecx, 16 // 16 pixels done; sets the flags tested by jg
jg convertloop // signed test: loop while pixels remaining > 0
ret
}
}
这是将32位的ARGB转成24位RGB(RGB24)的代码,这一步额外的转换降低了性能
下面是AVX2的YUV420转RGBA代码,libyuv里面的,汇编格式的宏
#define YUVTORGB_AVX2(YuvConstants) __asm { \
__asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\
__asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\
__asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \
__asm vpsubw ymm2, ymm3, ymm2 \
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \
__asm vpsubw ymm1, ymm3, ymm1 \
__asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \
__asm vpsubw ymm0, ymm3, ymm0 \
/* Step 2: Find Y contribution to 16 R,G,B values */ \
__asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \
__asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \
__asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \
__asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \
__asm vpsraw ymm0, ymm0, 6 \
__asm vpsraw ymm1, ymm1, 6 \
__asm vpsraw ymm2, ymm2, 6 \
__asm vpackuswb ymm0, ymm0, ymm0 /* B */ \
__asm vpackuswb ymm1, ymm1, ymm1 /* G */ \
__asm vpackuswb ymm2, ymm2, ymm2 /* R */ \
}
所以我尝试自己写AVX2的代码,但我没有汇编基础,很难直接在libyuv的基础上改,所以一直在网上找其它方法,
后来才知道C++可以通过编译器内建函数(intrinsics)以非汇编的方式调用AVX2、SSE2等系列指令,但基本上要在VC2005以上的版本下才可以使用。
这种非汇编的方式我应该可以尝试,后来我在网上找到有人用SSE2的非汇编方式的YUV转RGB32的代码,我就是从这里开始升级到AVX2,下面是SSE2的代码:
void yuv420_to_argb8888( uint8_t *yp, uint8_t *up, uint8_t *vp,
uint32_t sy, uint32_t suv,
int width, int height,
uint32_t *rgb, uint32_t srgb )
{
__m128i y0r0, y0r1, u0, v0;
__m128i y00r0, y01r0, y00r1, y01r1;
__m128i u00, u01, v00, v01;
__m128i rv00, rv01, gu00, gu01, gv00, gv01, bu00, bu01;
__m128i r00, r01, g00, g01, b00, b01;
__m128i rgb0123, rgb4567, rgb89ab, rgbcdef;
__m128i gbgb;
__m128i ysub, uvsub;
__m128i zero, facy, facrv, facgu, facgv, facbu;
__m128i *srcy128r0, *srcy128r1;
__m128i *dstrgb128r0, *dstrgb128r1;
__m64 *srcu64, *srcv64;
int x, y;
ysub = _mm_set1_epi32( 0x00100010 );
uvsub = _mm_set1_epi32( 0x00800080 );
facy = _mm_set1_epi32( 0x004a004a );
facrv = _mm_set1_epi32( 0x00660066 );
facgu = _mm_set1_epi32( 0x00190019 );
facgv = _mm_set1_epi32( 0x00340034 );
facbu = _mm_set1_epi32( 0x00810081 );
zero = _mm_set1_epi32( 0x00000000 );
for( y = 0; y < height; y += 2 ) {
srcy128r0 = (__m128i *)