算法如下:
需包含头文件
#include <mmintrin.h> //MMX
#include <xmmintrin.h> //SSE(include mmintrin.h)
#include <emmintrin.h> //SSE2(include xmmintrin.h)
#include <pmmintrin.h> //SSE3(include emmintrin.h)
#include <tmmintrin.h>//SSSE3(include pmmintrin.h)
#include <smmintrin.h>//SSE4.1(include tmmintrin.h)
#include <nmmintrin.h>//SSE4.2(include smmintrin.h)
#include <immintrin.h>//AVX(include wmmintrin.h)
/*
- YCbCrConversion
-
- function: Convert YCbCr 4:2:2 to RGB 8:8:8:8 (last 8 bits are undefined)
- in: pointer to 16bpp YUV buffer (Y, Cb, Y, Cr, Y, Cb, …)
- out pointer to 32bpp RGB buffer
- special uses a hardcoded float to int conversion routine for increased speed.
*/
#define FLOAT_MANTISSA_BITS 23
#define FLOAT_MANTISSA_MASK ((1<<(FLOAT_MANTISSA_BITS))-1)
/* Non-negative float values /
#define FLOAT2INT(flt, int)
(((float *)(&(int)))=(flt)+((float)(1<<(FLOAT_MANTISSA_BITS))),
(int)&=FLOAT_MANTISSA_MASK)
void YCbCrConversion(unsigned char *_pIn, unsigned char *_pOut, int nWidth, int nHeight)
{
int LumaWhite = 240;//FIXME: should match how chip is programmed
int LumaBlack = 16;//FIXME: should match how chip is programmed
int ChromaRange = max(LumaWhite - LumaBlack, 224);
float coeff_R_Cr = 1.4022f;
float coeff_G_Cr = -0.7142f;
float coeff_G_Cb = -0.3442f;
float coeff_B_Cb = 1.7721f;
float Y0, Cb, Y1, Cr;
float R0, G0, B0, R1, G1, B1;
float B_Cb128,R_Cr128,G_CrCb128;
int r0, b0, g0, r1, b1, g1;
int precalc_xy_raw;
int precalc_xy;
sv_u8 *pIn;
sv_u8 *pOut;
int n;
int i;
int inStride = nWidth * 2;
int outStride = nWidth * 4;
// Conversion matrix
// Take Y coefficent from SOC luma clip range
int Y_range = LumaWhite - LumaBlack;
float fY_scale = 255.0f / Y_range;
float fY_offs = (float)LumaBlack;
short Y_scale = (short)(fY_scale * 8192.0f);
short Y_offs = LumaBlack;
// Cb/Cr coefficients, scale by chroma range
int C_range = ChromaRange;
float fC_scale = 255.0f / C_range;
float fR_Cr = coeff_R_Cr * fC_scale;
float fG_Cr = coeff_G_Cr * fC_scale;
float fG_Cb = coeff_G_Cb * fC_scale;
float fB_Cb = coeff_B_Cb * fC_scale;
short R_Cr = (short)(fR_Cr * 8192.0f);
short G_Cr = (short)(fG_Cr * 8192.0f);
short G_Cb = (short)(fG_Cb * 8192.0f);
short B_Cb = (short)(fB_Cb * 8192.0f);
for (int y = 0; y < (int)nHeight; ++y)
{
pIn = (sv_u8 *)_pIn + y * inStride;
pOut = (sv_u8 *)_pOut + y * outStride;
n = nWidth;
i = 0;
// Vectorized loop. Unfinished pixels done in following serial loop.
if (g_bSSE2)
{
// * Algorithm implemented in SSE
// Assume: Cb-Y0-Cr-Y1 byte order [UYVY]
// Y = Y(in) - Y_offs;
// Cb = Cb(in) - 128;
// Cr = Cr(in) - 128;
// Y = Y_scale * Y
// R = Y + 0.0 * Cb + R_Cr * Cr
// G = Y + G_Cb * Cb + G_Cr * Cr
// B = Y + B_Cb * Cb + 0.0 * Cr
__m128i RCbCr = _mm_set_epi16(R_Cr, 0, R_Cr, 0, R_Cr, 0, R_Cr, 0);
__m128i GCbCr = _mm_set_epi16(G_Cr, G_Cb, G_Cr, G_Cb, G_Cr, G_Cb, G_Cr, G_Cb);
__m128i BCbCr = _mm_set_epi16( 0, B_Cb, 0, B_Cb, 0, B_Cb, 0, B_Cb);
__m128i RGBY = _mm_set_epi16(0, Y_scale, 0, Y_scale, 0, Y_scale, 0, Y_scale);
__m128i offsets = _mm_set_epi16(Y_offs, 128, Y_offs, 128, Y_offs, 128, Y_offs, 128);
// Each iteration of the loop does four pixels
for (; i < (n & ~3); i += 4)
{
// Load eight bytes (four pixels)
__m128i raw = _mm_loadl_epi64((__m128i *)(pIn + (i << 1)));
// Zero extend to 8 words and subtract the offsets
raw = _mm_unpacklo_epi8(raw, _mm_setzero_si128());
raw = _mm_sub_epi16(raw, offsets);
// Get Y's and scale them [Y3, Y2, Y1, Y0] (4 x int32)
__m128i ys = _mm_madd_epi16(_mm_srli_epi32(raw, 16), RGBY);
// Get Cb's and Cr's [Cr2, Cb2, Cr2, Cb2, Cr0, Cb0, Cr0, Cb0]
__m128i cbcrs = _mm_shufflelo_epi16(raw, (2<<6)|(0<<4)|(2<<2)|(0));
cbcrs = _mm_shufflehi_epi16(cbcrs, (2<<6)|(0<<4)|(2<<2)|(0));
__m128i Rs, Gs, Bs;
// Red channel
Rs = _mm_madd_epi16(cbcrs, RCbCr);
Rs = _mm_add_epi32(Rs, ys);
Rs = _mm_srai_epi32(Rs, 13); // Four R's as 32-bit ints
Rs = _mm_packs_epi32(Rs, Rs); // Four R's as 16-bit ints (values can be out of range)
Rs = _mm_packus_epi16(Rs, Rs); // Four R's as 8-bit uints, saturated
// Green channel
Gs = _mm_madd_epi16(cbcrs, GCbCr);
Gs = _mm_add_epi32(Gs, ys);
Gs = _mm_srai_epi32(Gs, 13); // Four G's as 32-bit ints
Gs = _mm_packs_epi32(Gs, Gs); // Four G's as 16-bit ints (values can be out of range)
Gs = _mm_packus_epi16(Gs, Rs); // Four G's as 8-bit uints, saturated
// Blue channel
Bs = _mm_madd_epi16(cbcrs, BCbCr);
Bs = _mm_add_epi32(Bs, ys);
Bs = _mm_srai_epi32(Bs, 13); // Four B's as 32-bit ints
Bs = _mm_packs_epi32(Bs, Bs); // Four B's as 16-bit ints (values can be out of range)
Bs = _mm_packus_epi16(Bs, Bs); // Four B's as 8-bit uints, saturated
// Merge into RGB pixels
Gs = _mm_unpacklo_epi8(Bs, Gs); // G3B3G2B2G1B1G0B0G3B3G2B2G1B1G0B0
Rs = _mm_unpacklo_epi8(Rs, _mm_setzero_si128()); // 00R300R200R100R000R300R200R100R0
Rs = _mm_unpacklo_epi16(Gs, Rs); // 00R3G3B300R2G2B200R1G1B100R0G0B0
_mm_storeu_si128((__m128i *)(pOut + (i << 2)), Rs);
}
}//if SSE2
// Serial loop
precalc_xy_raw = 2 * i;
precalc_xy = 4 * i;
// Each iteration of the loop does 2 pixels
for (; i < (n & ~1); i += 2)
{
Cb = (float)pIn[precalc_xy_raw] - 128.0f;
Y0 = fY_scale * ((float)pIn[precalc_xy_raw+1] - fY_offs);
Cr = (float)pIn[precalc_xy_raw+2] - 128.0f;
Y1 = fY_scale * ((float)pIn[precalc_xy_raw+3] - fY_offs);
precalc_xy_raw += 4;
R_Cr128 = fR_Cr*Cr;
G_CrCb128 = fG_Cr*Cr + fG_Cb*Cb;
B_Cb128 = fB_Cb*Cb;
R0 = Y0 + R_Cr128;
G0 = Y0 + G_CrCb128;
B0 = Y0 + B_Cb128;
R1 = Y1 + R_Cr128;
G1 = Y1 + G_CrCb128;
B1 = Y1 + B_Cb128;
if (R0<0.0f) R0=0.0f;
if (G0<0.0f) G0=0.0f;
if (B0<0.0f) B0=0.0f;
if (R0>255.0f) R0 = 255.0f;
if (G0>255.0f) G0 = 255.0f;
if (B0>255.0f) B0 = 255.0f;
if (R1<0.0f) R1=0.0f;
if (G1<0.0f) G1=0.0f;
if (B1<0.0f) B1=0.0f;
if (R1>255.0f) R1 = 255.0f;
if (G1>255.0f) G1 = 255.0f;
if (B1>255.0f) B1 = 255.0f;
FLOAT2INT(R0, r0);
FLOAT2INT(G0, g0);
FLOAT2INT(B0, b0);
FLOAT2INT(R1, r1);
FLOAT2INT(G1, g1);
FLOAT2INT(B1, b1);
pOut[precalc_xy++] = (unsigned char)b0;//_FastFloat2UnsignedChar((DWORD*)&B0);
pOut[precalc_xy++] = (unsigned char)g0;//_FastFloat2UnsignedChar((DWORD*)&G0);
pOut[precalc_xy++] = (unsigned char)r0;//_FastFloat2UnsignedChar((DWORD*)&R0);
precalc_xy++;
pOut[precalc_xy++] = (unsigned char)b1;//_FastFloat2UnsignedChar((DWORD*)&B1);
pOut[precalc_xy++] = (unsigned char)g1;//_FastFloat2UnsignedChar((DWORD*)&G1);
pOut[precalc_xy++] = (unsigned char)r1;//_FastFloat2UnsignedChar((DWORD*)&R1);
precalc_xy++;
}
// Odd column, do one last pixel. Get Cr from previous pixel group
if (i < n)
{
Cb = (float)pIn[precalc_xy_raw+0] - 128.0f;
Y0 = fY_scale * ((float)pIn[precalc_xy_raw+1] - fY_offs);
Cr = (float)pIn[precalc_xy_raw-2] - 128.0f;
precalc_xy_raw += 4;
R_Cr128 = fR_Cr*Cr;
G_CrCb128 = fG_Cr*Cr + fG_Cb*Cb;
B_Cb128 = fB_Cb*Cb;
R0 = Y0 + R_Cr128;
G0 = Y0 + G_CrCb128;
B0 = Y0 + B_Cb128;
if (R0<0.0f) R0=0.0f;
if (G0<0.0f) G0=0.0f;
if (B0<0.0f) B0=0.0f;
if (R0>255.0f) R0 = 255.0f;
if (G0>255.0f) G0 = 255.0f;
if (B0>255.0f) B0 = 255.0f;
FLOAT2INT(R0, r0);
FLOAT2INT(G0, g0);
FLOAT2INT(B0, b0);
pOut[precalc_xy++] = (unsigned char)b0;//_FastFloat2UnsignedChar((DWORD*)&B0);
pOut[precalc_xy++] = (unsigned char)g0;//_FastFloat2UnsignedChar((DWORD*)&G0);
pOut[precalc_xy++] = (unsigned char)r0;//_FastFloat2UnsignedChar((DWORD*)&R0);
}
}
}
————————————————
版权声明:本文为CSDN博主「justcode12356」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/wangjianhong0574/article/details/134495257