YUV转RGB--使用MMX和CUDA优化

http://blog.csdn.net/mikedai/article/details/64215860这篇博文中,采用查表的方式优化YUV转RGB,但是仍然没有充分发挥计算机硬件性能:对1920x1080的YV12格式,在intel i7-7700上测试为120fps,也即是每帧耗时约8.3ms。
采用mmx结合查找表优化,首先创建查找表,跟之前博文里有点不一样:

// 6-bit fixed-point lookup tables for YUV -> RGB conversion.
// Lane order per 4-short entry: [0]=blue, [1]=green, [2]=red, [3]=padding,
// matching the byte order produced when the shorts are packed to bytes.
static  short CoefficientsRGBY[256][4];
static  short CoefficientsRGBU[256][4];
static  short CoefficientsRGBV[256][4];

// Round half away from zero to short. The previous code added +0.5 before
// truncation, which mis-rounds negative values toward zero
// (e.g. -74.496 became -73 instead of -74).
static short round_to_short(double v)
{
    return (short)(v >= 0 ? v + 0.5 : v - 0.5);
}

// Fill the three lookup tables with YUV->RGB coefficients scaled by 64
// (6-bit fixed point). Must be called once before any conversion.
void init_coefficients()
{
    int i;
    for (i = 0; i < 256; i++)
    {
        // Luma contributes equally to B, G and R: 1.164 * (Y - 16).
        short luma = round_to_short(1.164 * 64 * (i - 16));
        CoefficientsRGBY[i][0] = luma;
        CoefficientsRGBY[i][1] = luma;
        CoefficientsRGBY[i][2] = luma;
        CoefficientsRGBY[i][3] = 0x00;

        // U (Cb) contributes to blue and green.
        CoefficientsRGBU[i][0] = round_to_short(2.018 * 64 * (i - 128));
        CoefficientsRGBU[i][1] = round_to_short(-0.391 * 64 * (i - 128));
        CoefficientsRGBU[i][2] = 0x00;
        CoefficientsRGBU[i][3] = 0x00;

        // V (Cr) contributes to green and red.
        CoefficientsRGBV[i][0] = 0x00;
        CoefficientsRGBV[i][1] = round_to_short(-0.813 * 64 * (i - 128));
        CoefficientsRGBV[i][2] = round_to_short(1.596 * 64 * (i - 128));
        CoefficientsRGBV[i][3] = 0x00;
    }
}

这里只是采用了16位整数作为查找表的值,精度上可能比32位稍差,但是对于大多数应用已经足够。
采用mmx实现:

// Convert one planar YUV 4:2:0 frame to packed 32-bit pixels using MMX and
// the CoefficientsRGB* lookup tables (6-bit fixed point).
// Each inner-loop iteration converts a 2x2 pixel block: two luma samples from
// the current row and two from the next row share a single chroma pair, so
// width and height must both be even.
// NOTE(review): declared as returning int but never sets a return value —
// callers must not rely on the result.
// NOTE(review): *edi (the v pointer) indexes CoefficientsRGBU and *esi (the
// u pointer) indexes CoefficientsRGBV — confirm this matches the caller's
// plane order (e.g. YV12 stores V before U); otherwise U/V are swapped.
// 32-bit MSVC inline assembly only; does not compile for x64 (see prose below).
int YUV420_RGB32_mmx(uint32_t* rgb, int width, int height, uint8_t* y, uint8_t* u, uint8_t* v)
{
    __asm{
            pushad                              ; save all general-purpose registers (ebp is used as data pointer below)
            finit                               ; reset x87/MMX state
            xor eax, eax                        ; eax = 0; only al is written later, so eax stays a clean table index
            mov ebx, height
            mov ecx, width
            mov edx, y                          ; edx -> luma plane
            mov edi, v                          ; edi -> v plane (see NOTE above)
            mov esi, u                          ; esi -> u plane
            mov ebp, rgb                        ; ebp -> output pixel buffer
        hloop:
            push ebx                            ; save remaining row count
            mov ebx, ecx                        ; inner counter = width
        wloop :
            push ebx                            ; save remaining column count
            xor ebx, ebx                        ; ebx = 0; only bl is written, so ebx is a clean table index too
            mov al, [edi]                       ; chroma sample from the v pointer
            mov bl, [esi]                       ; chroma sample from the u pointer
            movq mm0, [CoefficientsRGBU + 8*eax]  ; mm0 = combined chroma term, 4 shorts (B,G,R,0)
            paddw mm0, [CoefficientsRGBV + 8*ebx]
            mov al, [edx]                       ; luma of pixel (x, y)
            mov bl, [edx + 1]                   ; luma of pixel (x+1, y)
            movq mm1, [CoefficientsRGBY + 8 * eax]
            movq mm2, [CoefficientsRGBY + 8 * ebx]
            mov al, [edx + ecx]                 ; luma of pixel (x, y+1) — next row, same column
            mov bl, [edx + ecx + 1]             ; luma of pixel (x+1, y+1)
            movq mm3, [CoefficientsRGBY + 8 * eax]
            movq mm4, [CoefficientsRGBY + 8 * ebx]
            paddw mm1, mm0                      ; add the shared chroma term to all four
            paddw mm2, mm0                      ; luma terms: 4 pixels processed in parallel
            paddw mm3, mm0
            paddw mm4, mm0
            psraw mm1, 6                        ; >> 6 undoes the *64 fixed-point scale
            psraw mm2, 6
            psraw mm3, 6
            psraw mm4, 6
            packuswb mm1, mm2                   ; saturate shorts to bytes: 2 packed pixels, row y
            packuswb mm3, mm4                   ; 2 packed pixels, row y+1
            movq[ebp], mm1                      ; store 2 pixels into the current output row
            movq[ebp + 4 * ecx], mm3            ; store 2 pixels into the next output row
            add ebp, 8                          ; advance output by 2 pixels (8 bytes)
            add edx, 2                          ; advance luma by 2 samples
            add edi, 1                          ; chroma advances 1 per 2 luma samples (4:2:0)
            add esi, 1
            pop ebx
            sub ebx, 2                          ; 2 columns consumed
            jnz wloop
            lea ebp, [ebp + 4*ecx]              ; skip the output row already written as "next row"
            add edx, ecx                        ; skip the luma row already consumed as "next row"
            pop ebx
            sub ebx, 2                          ; 2 rows consumed
            jnz hloop
            emms                                ; leave MMX state so x87 FP can be used again
            popad
    }
}

测试结果:1920x1080,YV12,100帧,172ms,将近600fps,跟之前查表实现提升5倍速度。
原因分析:采用mmx一次能读取8字节数据,另外,

            paddw mm1, mm0
            paddw mm2, mm0
            paddw mm3, mm0
            paddw mm4, mm0

相当于一次并行处理4个像素。
缺点:64位MSVC不支持内联汇编,上述代码在64位下编译不通过。国外网站有资料说可以把汇编代码单独写成.asm文件、不与C代码混写,这样可以在64位下编译,但是没有验证过。

使用mmx或者sse指令可以实现小规模的数据并行,姑且称为低度并行计算;但要实现高度并行计算并获得更高的效率,需要采用GPU。

使用CUDA实现,参考nvidia Video_Codec_SDK和opencv中的代码:

// Abort the process with a diagnostic if a CUDA runtime call failed.
// error:     return code of a CUDA runtime call (cudaError_t; existing callers
//            pass it as int, both convert implicitly in C++).
// message:   short description of the operation that was attempted.
// file/line: call-site location, normally __FILE__ / __LINE__.
// const char* (instead of char*) lets callers pass string literals without
// a deprecated conversion; cudaGetErrorString adds a readable error name.
void cudasafe(int error, const char* message, const char* file, int line) {
    if (error != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s : %i (%s). In %s line %d\n",
                message, error, cudaGetErrorString((cudaError_t)error), file, line);
        exit(-1);
    }
}
// Ceiling integer division: number of grain-sized chunks needed to cover total.
// Used to size the kernel launch grid.
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
    // Bias the numerator so truncating division rounds up.
    const int biased = total + grain - 1;
    return biased / grain;
}

typedef unsigned char uchar;
typedef unsigned int  uint;

// Convert one 10-bit YUV triplet to (unclamped) float RGB via the 3x3 color
// matrix constHueColorSpaceMat (declared elsewhere — presumably a 9-element
// __constant__ float array; not visible in this file).
// yuvi[0] = luma, yuvi[1] = Cb, yuvi[2] = Cr; chroma is re-centered around
// 512, the midpoint of the 10-bit range.
__device__ static void YUV2RGB(const uint* yuvi, float* red, float* green, float* blue)
{
    const float luma     = (float)yuvi[0];
    const float chromaCb = (float)((int)yuvi[1] - 512.0f);
    const float chromaCr = (float)((int)yuvi[2] - 512.0f);

    // Rows of the matrix map (Y, Cb, Cr) to R, G, B respectively.
    *red   = luma * constHueColorSpaceMat[0]
           + chromaCb * constHueColorSpaceMat[1]
           + chromaCr * constHueColorSpaceMat[2];
    *green = luma * constHueColorSpaceMat[3]
           + chromaCb * constHueColorSpaceMat[4]
           + chromaCr * constHueColorSpaceMat[5];
    *blue  = luma * constHueColorSpaceMat[6]
           + chromaCb * constHueColorSpaceMat[7]
           + chromaCr * constHueColorSpaceMat[8];
}

// Clamp 10-bit float RGB channels and pack them into one 32-bit pixel with
// 8 bits per component; the caller-supplied alpha bits are OR-ed in as-is
// (callers pass them pre-shifted, e.g. 0xff << 24).
// Resulting little-endian byte layout: B | G<<8 | R<<16 | alpha.
__device__ static uint RGBA_pack_10bit(float red, float green, float blue, uint alpha)
{
    // Clamp each channel to the representable 10-bit range [0, 1023].
    const float b = min(max(blue,  0.0f), 1023.f);
    const float g = min(max(green, 0.0f), 1023.f);
    const float r = min(max(red,   0.0f), 1023.f);

    // Drop the two low bits of each 10-bit value to get 8-bit components.
    return ((uint)b >> 2)
         | (((uint)g >> 2) << 8)
         | (((uint)r >> 2) << 16)
         | alpha;
}

// CUDA kernel for outputing the final ARGB output from NV12

#define COLOR_COMPONENT_BIT_SIZE 10
#define COLOR_COMPONENT_MASK     0x3FF


// CUDA kernel: convert an NV12 frame to packed 32-bit pixels.
// Assumed NV12 layout: a full-resolution luma plane with nSourcePitch-byte
// rows, immediately followed (at byte offset nSourcePitch * height) by a
// half-height plane of interleaved Cb,Cr byte pairs with the same pitch.
// Each thread emits TWO horizontally adjacent pixels, so the grid must cover
// ceil(width / (2*blockDim.x)) x ceil(height / blockDim.y) blocks (see the
// divUp-based launch in the host wrapper).
// nDestPitch is in BYTES and is divided by 4 below to index the uint output.
__global__ void NV12_to_RGB(const uchar* srcImage, size_t nSourcePitch,
    uint* dstImage, size_t nDestPitch,
    uint width, uint height)
{
    // Pad borders with duplicate pixels, and we multiply by 2 because we process 2 pixels per thread
    const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
    const int y = blockIdx.y *  blockDim.y + threadIdx.y;

    // Guard the grid tail. NOTE(review): signed x/y are compared against the
    // unsigned width/height; harmless here since x,y >= 0, but worth tidying.
    if (x >= width || y >= height)
        return;

    // Read 2 Luma components at a time, so we don't waste processing since CbCr are decimated this way.
    // if we move to texture we could read 4 luminance values

    // Each element packs one pixel's YUV into three 10-bit fields:
    // bits [2:11] = luma<<2, [12:21] = Cb<<2, [22:31] = Cr<<2.
    uint yuv101010Pel[2];

    yuv101010Pel[0] = (srcImage[y * nSourcePitch + x]) << 2;
    yuv101010Pel[1] = (srcImage[y * nSourcePitch + x + 1]) << 2;

    // The interleaved chroma plane starts right after the luma plane.
    const size_t chromaOffset = nSourcePitch * height;

    // Chroma is vertically decimated 2:1: image row y maps to chroma row y/2.
    const int y_chroma = y >> 1;

    if (y & 1)  // odd scanline ?
    {
        // x is always even (2 pixels per thread), so in the interleaved plane
        // [x] is the Cb byte and [x+1] is the Cr byte of this pixel pair.
        uint chromaCb = srcImage[chromaOffset + y_chroma * nSourcePitch + x];
        uint chromaCr = srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1];

        if (y_chroma < ((height >> 1) - 1)) // interpolate chroma vertically
        {
            // Average with the next chroma row; +1 makes the shift round to nearest.
            chromaCb = (chromaCb + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x] + 1) >> 1;
            chromaCr = (chromaCr + srcImage[chromaOffset + (y_chroma + 1) * nSourcePitch + x + 1] + 1) >> 1;
        }

        // Both pixels of the pair share the same chroma sample (4:2:0).
        yuv101010Pel[0] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
        yuv101010Pel[0] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

        yuv101010Pel[1] |= (chromaCb << (COLOR_COMPONENT_BIT_SIZE + 2));
        yuv101010Pel[1] |= (chromaCr << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
    }
    else
    {
        // Even scanline: use the co-located chroma row directly, no interpolation.
        yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
        yuv101010Pel[0] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));

        yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x] << (COLOR_COMPONENT_BIT_SIZE + 2));
        yuv101010Pel[1] |= ((uint)srcImage[chromaOffset + y_chroma * nSourcePitch + x + 1] << ((COLOR_COMPONENT_BIT_SIZE << 1) + 2));
    }

    // this steps performs the color conversion
    // Unpack the 10-bit fields back out: yuvi[0..2] for pixel x, [3..5] for x+1.
    uint yuvi[6];
    float red[2], green[2], blue[2];

    yuvi[0] = (yuv101010Pel[0] & COLOR_COMPONENT_MASK);
    yuvi[1] = ((yuv101010Pel[0] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
    yuvi[2] = ((yuv101010Pel[0] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

    yuvi[3] = (yuv101010Pel[1] & COLOR_COMPONENT_MASK);
    yuvi[4] = ((yuv101010Pel[1] >> COLOR_COMPONENT_BIT_SIZE)       & COLOR_COMPONENT_MASK);
    yuvi[5] = ((yuv101010Pel[1] >> (COLOR_COMPONENT_BIT_SIZE << 1)) & COLOR_COMPONENT_MASK);

    // YUV to RGB Transformation conversion
    YUV2RGB(&yuvi[0], &red[0], &green[0], &blue[0]);
    YUV2RGB(&yuvi[3], &red[1], &green[1], &blue[1]);

    // Clamp the results to RGBA

    // Convert the destination pitch from bytes to uint elements.
    const size_t dstImagePitch = nDestPitch >> 2;

    // Pack both pixels with fully opaque alpha (0xff in the top byte).
    dstImage[y * dstImagePitch + x] = RGBA_pack_10bit(red[0], green[0], blue[0], ((uint)0xff << 24));
    dstImage[y * dstImagePitch + x + 1] = RGBA_pack_10bit(red[1], green[1], blue[1], ((uint)0xff << 24));
}


// Host wrapper: upload an NV12 frame, run NV12_to_RGB, and copy the packed
// 32-bit result back to host memory.
// yuv:        host NV12 buffer (luma plane followed by interleaved CbCr).
// data:       host output buffer, at least data_pitch * h bytes.
// yuv_pitch:  byte stride of the source rows (>= w).
// data_pitch: byte stride of the destination rows (>= w * 4).
// NOTE: allocating/freeing device memory on every call is expensive; for a
// video stream, allocate once and reuse the buffers across frames.
void nv12_to_rgb32(uint8_t* yuv, uint8_t* data, uint32_t yuv_pitch, uint32_t data_pitch, int w, int h)
{
    uint8_t* device_yuv;
    uint32_t* device_data;
    uint32_t  yuv_size;
    uint32_t  data_size;
    // Size the transfers by PITCH, not width: the kernel addresses rows by
    // yuv_pitch/data_pitch, so padded row bytes must be copied too. The old
    // w-based sizes undersized the buffers whenever pitch > width.
    // (Identical to the previous values when pitch == width.)
    yuv_size = yuv_pitch * h * 3 / 2;   // NV12: full luma plane + half-height CbCr plane
    data_size = data_pitch * h;
    if (yuv_size == 0)
        return;
    cudasafe(cudaMalloc((void **)&device_yuv, yuv_size), "Original image allocation ", __FILE__, __LINE__);
    cudasafe(cudaMemcpy(device_yuv, yuv, yuv_size, cudaMemcpyHostToDevice), "Copy original image to device ", __FILE__, __LINE__);
    cudasafe(cudaMalloc((void **)&device_data, data_size), "RGB image allocation ", __FILE__, __LINE__);

    // One thread per 2 horizontal pixels (see NV12_to_RGB).
    dim3 block(32, 8);
    dim3 grid(divUp(w, 2 * block.x), divUp(h, block.y));
    NV12_to_RGB << <grid, block >> > (device_yuv, yuv_pitch, device_data, (size_t)data_pitch, w, h);
    // Kernel launches don't return errors directly; check for launch-config
    // failures explicitly instead of letting the memcpy below mask them.
    cudasafe(cudaGetLastError(), "NV12_to_RGB kernel launch", __FILE__, __LINE__);
    cudasafe(cudaMemcpy(data, device_data, data_size, cudaMemcpyDeviceToHost), "from device to host", __FILE__, __LINE__);
    cudaFree(device_yuv);
    cudaFree(device_data);
}

以上代码只是nv12转rgb,稍作修改可以实现nv21转rgb、yv12转rgb,或者其他yuv格式。
测试结果,gtx 1050 显卡,i7-7700cpu,1920x1080,1500fps,不包括从内存拷贝数据到GPU的时间。

结论:采用GPU可以实现更快的转换速度,但是从内存将yuv数据拷贝到显存差不多1ms,然后将结果拷贝回内存仍然需要1ms,所以大部分时间消耗在IO上,这样算下来也就500fps左右;使用mmx可以达到600fps,但是mmx的寄存器不多,多个线程同时用mmx可能会存在一些问题。二者都可以实现更高的转换效率,至于如何选择,需要看具体需求。

评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值