yuv 422 转rgb32 simd sse加速算法

算法如下:

需包含头文件

#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)

/*

  • YCbCrConversion

  • function: Convert YCbCr 4:2:2 to RGB 8:8:8:8 (last 8 bits are undefined)
  • in: pointer to 16bpp YUV buffer (Y, Cb, Y, Cr, Y, Cb, …)
  • out pointer to 32bpp RGB buffer
  • special uses a hardcoded float to int conversion routine for increased speed.
    */

#define FLOAT_MANTISSA_BITS 23
#define FLOAT_MANTISSA_MASK ((1<<(FLOAT_MANTISSA_BITS))-1)

/* Non-negative float values /
#define FLOAT2INT(flt, int)
(
((float *)(&(int)))=(flt)+((float)(1<<(FLOAT_MANTISSA_BITS))),
(int)&=FLOAT_MANTISSA_MASK)

void YCbCrConversion(unsigned char *_pIn, unsigned char *_pOut, int nWidth, int nHeight)
{
int LumaWhite = 240;//FIXME: should match how chip is programmed
int LumaBlack = 16;//FIXME: should match how chip is programmed
int ChromaRange = max(LumaWhite - LumaBlack, 224);
float coeff_R_Cr = 1.4022f;
float coeff_G_Cr = -0.7142f;
float coeff_G_Cb = -0.3442f;
float coeff_B_Cb = 1.7721f;

float Y0, Cb, Y1, Cr;
float R0, G0, B0, R1, G1, B1;
float B_Cb128,R_Cr128,G_CrCb128;
int   r0, b0, g0, r1, b1, g1;

int precalc_xy_raw;
int precalc_xy;

sv_u8 *pIn;
sv_u8 *pOut;

int     n;
int     i;
int     inStride = nWidth * 2;
int     outStride = nWidth * 4;

//  Conversion matrix
//  Take Y coefficent from SOC luma clip range
int    Y_range = LumaWhite - LumaBlack;
float fY_scale = 255.0f / Y_range;
float fY_offs  = (float)LumaBlack;
short  Y_scale = (short)(fY_scale * 8192.0f);
short  Y_offs  = LumaBlack;
//  Cb/Cr coefficients, scale by chroma range
int    C_range = ChromaRange;
float fC_scale = 255.0f / C_range;
float fR_Cr    = coeff_R_Cr * fC_scale;
float fG_Cr    = coeff_G_Cr * fC_scale;
float fG_Cb    = coeff_G_Cb * fC_scale;
float fB_Cb    = coeff_B_Cb * fC_scale;
short  R_Cr    = (short)(fR_Cr * 8192.0f);
short  G_Cr    = (short)(fG_Cr * 8192.0f);
short  G_Cb    = (short)(fG_Cb * 8192.0f);
short  B_Cb    = (short)(fB_Cb * 8192.0f);

for (int y = 0; y < (int)nHeight; ++y)
{
    pIn  = (sv_u8 *)_pIn + y * inStride;
    pOut = (sv_u8 *)_pOut + y * outStride;
    n = nWidth;
    i = 0;

    //  Vectorized loop. Unfinished pixels done in following serial loop.
  if (g_bSSE2)
    {
        // * Algorithm implemented in SSE
        //  Assume: Cb-Y0-Cr-Y1 byte order [UYVY]
        //  Y = Y(in) - Y_offs;
        //  Cb = Cb(in) - 128;
        //  Cr = Cr(in) - 128;
        //  Y = Y_scale * Y
        //  R = Y +  0.0 * Cb + R_Cr * Cr
        //  G = Y + G_Cb * Cb + G_Cr * Cr
        //  B = Y + B_Cb * Cb +  0.0 * Cr
        __m128i RCbCr   = _mm_set_epi16(R_Cr,    0, R_Cr,    0, R_Cr,    0, R_Cr,    0);
        __m128i GCbCr   = _mm_set_epi16(G_Cr, G_Cb, G_Cr, G_Cb, G_Cr, G_Cb, G_Cr, G_Cb);
        __m128i BCbCr   = _mm_set_epi16(   0, B_Cb,    0, B_Cb,    0, B_Cb,    0, B_Cb);
        __m128i RGBY    = _mm_set_epi16(0, Y_scale, 0, Y_scale, 0, Y_scale, 0, Y_scale);
        __m128i offsets = _mm_set_epi16(Y_offs, 128, Y_offs, 128, Y_offs, 128, Y_offs, 128);

        //  Each iteration of the loop does four pixels
        for (; i < (n & ~3); i += 4)
        {
            //  Load eight bytes (four pixels)
            __m128i raw = _mm_loadl_epi64((__m128i *)(pIn + (i << 1)));
            //  Zero extend to 8 words and subtract the offsets
            raw = _mm_unpacklo_epi8(raw, _mm_setzero_si128());
            raw = _mm_sub_epi16(raw, offsets);
            //  Get Y's and scale them [Y3, Y2, Y1, Y0] (4 x int32)
            __m128i ys = _mm_madd_epi16(_mm_srli_epi32(raw, 16), RGBY);
            //  Get Cb's and Cr's [Cr2, Cb2, Cr2, Cb2, Cr0, Cb0, Cr0, Cb0]
            __m128i cbcrs = _mm_shufflelo_epi16(raw, (2<<6)|(0<<4)|(2<<2)|(0));
            cbcrs = _mm_shufflehi_epi16(cbcrs, (2<<6)|(0<<4)|(2<<2)|(0));

            __m128i Rs, Gs, Bs;
            //  Red channel
            Rs = _mm_madd_epi16(cbcrs, RCbCr);
            Rs = _mm_add_epi32(Rs, ys);
            Rs = _mm_srai_epi32(Rs, 13); //  Four R's as 32-bit ints
            Rs = _mm_packs_epi32(Rs, Rs); //  Four R's as 16-bit ints (values can be out of range)
            Rs = _mm_packus_epi16(Rs, Rs); //  Four R's as 8-bit uints, saturated

            //  Green channel
            Gs = _mm_madd_epi16(cbcrs, GCbCr);
            Gs = _mm_add_epi32(Gs, ys);
            Gs = _mm_srai_epi32(Gs, 13); //  Four G's as 32-bit ints
            Gs = _mm_packs_epi32(Gs, Gs); //  Four G's as 16-bit ints (values can be out of range)
            Gs = _mm_packus_epi16(Gs, Rs); //  Four G's as 8-bit uints, saturated

            //  Blue channel
            Bs = _mm_madd_epi16(cbcrs, BCbCr);
            Bs = _mm_add_epi32(Bs, ys);
            Bs = _mm_srai_epi32(Bs, 13); //  Four B's as 32-bit ints
            Bs = _mm_packs_epi32(Bs, Bs); //  Four B's as 16-bit ints (values can be out of range)
            Bs = _mm_packus_epi16(Bs, Bs); //  Four B's as 8-bit uints, saturated

            //  Merge into RGB pixels
            Gs = _mm_unpacklo_epi8(Bs, Gs);                   // G3B3G2B2G1B1G0B0G3B3G2B2G1B1G0B0
            Rs = _mm_unpacklo_epi8(Rs, _mm_setzero_si128());  // 00R300R200R100R000R300R200R100R0
            Rs = _mm_unpacklo_epi16(Gs, Rs);                  // 00R3G3B300R2G2B200R1G1B100R0G0B0

            _mm_storeu_si128((__m128i *)(pOut + (i << 2)), Rs);
        }
    }//if SSE2

    //  Serial loop
    precalc_xy_raw = 2 * i;
    precalc_xy = 4 * i;
    //  Each iteration of the loop does 2 pixels
    for (; i < (n & ~1); i += 2)
    {
        Cb =            (float)pIn[precalc_xy_raw] - 128.0f;
        Y0 = fY_scale * ((float)pIn[precalc_xy_raw+1] - fY_offs);
        Cr =            (float)pIn[precalc_xy_raw+2] - 128.0f;
        Y1 = fY_scale * ((float)pIn[precalc_xy_raw+3] - fY_offs);
        precalc_xy_raw += 4;
 
        R_Cr128   =  fR_Cr*Cr;
        G_CrCb128 =  fG_Cr*Cr + fG_Cb*Cb;
        B_Cb128   =  fB_Cb*Cb;

        R0 = Y0 + R_Cr128;
        G0 = Y0 + G_CrCb128;
        B0 = Y0 + B_Cb128;
        
        R1 = Y1 + R_Cr128;
        G1 = Y1 + G_CrCb128;
        B1 = Y1 + B_Cb128;

        if (R0<0.0f) R0=0.0f;
        if (G0<0.0f) G0=0.0f;
        if (B0<0.0f) B0=0.0f;
        if (R0>255.0f) R0 = 255.0f;
        if (G0>255.0f) G0 = 255.0f;
        if (B0>255.0f) B0 = 255.0f;

        if (R1<0.0f) R1=0.0f;
        if (G1<0.0f) G1=0.0f;
        if (B1<0.0f) B1=0.0f;
        if (R1>255.0f) R1 = 255.0f;
        if (G1>255.0f) G1 = 255.0f;
        if (B1>255.0f) B1 = 255.0f;

        FLOAT2INT(R0, r0);
        FLOAT2INT(G0, g0);
        FLOAT2INT(B0, b0);

        FLOAT2INT(R1, r1);
        FLOAT2INT(G1, g1);
        FLOAT2INT(B1, b1);

        pOut[precalc_xy++] = (unsigned char)b0;//_FastFloat2UnsignedChar((DWORD*)&B0);
        pOut[precalc_xy++] = (unsigned char)g0;//_FastFloat2UnsignedChar((DWORD*)&G0);
        pOut[precalc_xy++] = (unsigned char)r0;//_FastFloat2UnsignedChar((DWORD*)&R0);
        precalc_xy++; 

        pOut[precalc_xy++] = (unsigned char)b1;//_FastFloat2UnsignedChar((DWORD*)&B1);
        pOut[precalc_xy++] = (unsigned char)g1;//_FastFloat2UnsignedChar((DWORD*)&G1);
        pOut[precalc_xy++] = (unsigned char)r1;//_FastFloat2UnsignedChar((DWORD*)&R1);
        precalc_xy++;
    }

    //  Odd column, do one last pixel. Get Cr from previous pixel group
    if (i < n)
    {
        Cb =            (float)pIn[precalc_xy_raw+0] - 128.0f;
        Y0 = fY_scale * ((float)pIn[precalc_xy_raw+1] - fY_offs);
        Cr =            (float)pIn[precalc_xy_raw-2] - 128.0f;
        precalc_xy_raw += 4;
 
        R_Cr128   =  fR_Cr*Cr;
        G_CrCb128 =  fG_Cr*Cr + fG_Cb*Cb;
        B_Cb128   =  fB_Cb*Cb;
        R0 = Y0 + R_Cr128;
        G0 = Y0 + G_CrCb128;
        B0 = Y0 + B_Cb128;
        if (R0<0.0f) R0=0.0f;
        if (G0<0.0f) G0=0.0f;
        if (B0<0.0f) B0=0.0f;
        if (R0>255.0f) R0 = 255.0f;
        if (G0>255.0f) G0 = 255.0f;
        if (B0>255.0f) B0 = 255.0f;
        FLOAT2INT(R0, r0);
        FLOAT2INT(G0, g0);
        FLOAT2INT(B0, b0);
        pOut[precalc_xy++] = (unsigned char)b0;//_FastFloat2UnsignedChar((DWORD*)&B0);
        pOut[precalc_xy++] = (unsigned char)g0;//_FastFloat2UnsignedChar((DWORD*)&G0);
        pOut[precalc_xy++] = (unsigned char)r0;//_FastFloat2UnsignedChar((DWORD*)&R0);
    }
}

}
————————————————
版权声明:本文为CSDN博主「justcode12356」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/wangjianhong0574/article/details/134495257

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值