关于YU12、NV12 和 NV21

YUV 是一种颜色空间,用于描述彩色图像的亮度(Y)和色度(U、V)信息。YUV420 是 YUV 中最常用的一种采样格式,其色度分量在水平和垂直方向上各做 2:1 下采样,即每 2×2 共四个 Y 像素共用一组 U、V 采样值。YUV420 格式又可以分为 YUV420P(平面格式)、YUV420SP(半平面格式)和 YUV420P10 等多种子格式。而 YU12、NV12 和 NV21 就是 YUV420 的三种子格式,具体如下:

YU12 格式:YU12(也称 I420)是 YUV420P 格式的一种。在 YU12 格式中,图像的亮度(Y)分量存储在一个平面中,而色度的 U、V 分量并不交替存储,而是各自连续存储在独立的平面中:Y 平面之后先存储完整的 U 平面,再存储完整的 V 平面。U、V 平面的宽和高均为 Y 平面的一半,每个采样值占用一个字节。

NV12 格式:NV12 是 YUV420SP 格式的一种,也是最常用的一种。在 NV12 格式中,图像的亮度(Y)分量仍然存储在一个平面中,而色度(U、V)分量则交替存储在另一个平面中,顺序是先 U 后 V(UVUV…)。每 2×2 共四个 Y 像素共用一组 U、V 采样值,U 和 V 采样值各占用一个字节。

NV21 格式:NV21 也是 YUV420SP 格式的一种,与 NV12 的区别仅在于色度平面中的字节顺序:在 NV21 格式中,先存储 V 分量,然后是 U 分量(VUVU…)。同样是每 2×2 共四个 Y 像素共用一组 U、V 采样值,U 和 V 采样值各占用一个字节。



/// @brief Convert a YUV (YU12, also called I420) image to an RGB image.
///        The output is interleaved as RGBRGBRGB, i.e. it is still in HWC layout.
/// @param in_buf The input buffer allocated in device memory.
///               The size is (in_w * in_h * 3 / 2) * sizeof(uint8_t).
/// @param out_buf The output buffer allocated in device memory.
///               The size is (in_w * in_h * 3) * sizeof(uint8_t).
/// @param in_w The width of input image.
/// @param in_h The height of input image.
/// @param stream The CUDA stream on which the conversion kernel is launched.
void YUVYu12ToRGB(uint8_t* in_buf, uint8_t* out_buf,
                       int in_w, int in_h, cudaStream_t stream);
/// Host-side launcher: enqueues the YU12 (I420) -> interleaved RGB conversion
/// on `stream`. The call only launches a kernel; it does not synchronize.
///
/// Two kernels are used depending on the width:
///  - in_w divisible by 4: Yuv2rgb24, where each thread converts a tile of
///    4 x 2 pixels using 32-bit loads.
///  - otherwise: Yuv2rgb24_general, one block per pair of rows, staging the
///    rows through dynamic shared memory.
///
/// @param in_buf  Device pointer to the YU12 frame (in_w * in_h * 3 / 2 bytes).
/// @param out_buf Device pointer to the RGB output (in_w * in_h * 3 bytes).
/// @param in_w    Image width in pixels.
/// @param in_h    Image height in pixels; rows are processed in pairs.
/// @param stream  CUDA stream the kernel is launched on.
void YUVYu12ToRGB(uint8_t* in_buf, uint8_t* out_buf,
                     int in_w, int in_h, cudaStream_t stream){
  // Nothing can be converted without buffers or without at least one row
  // pair. Launching anyway would request a grid with a zero (or, for negative
  // sizes, a wrapped-around) dimension, which is an invalid configuration.
  if (in_buf == nullptr || out_buf == nullptr || in_w <= 0 || in_h < 2) {
    return;
  }

  if (in_w % 4 == 0) {
      // Yuv2rgb24 sizes its static shared staging buffer for at most 128
      // threads per block, so the 32 x 4 block shape must not be enlarged.
      dim3 block(32, 4, 1);
      // The kernel is indexed in units of 4 pixels horizontally (one uint32_t
      // of luma) and 2 rows vertically, hence the >> 2 and >> 1.
      dim3 grid(((in_w >> 2) + block.x - 1) / block.x, ((in_h >> 1) + block.y - 1) / block.y, 1);
      Yuv2rgb24<true><<<grid, block, 0, stream>>>((uint32_t*)in_buf, (void*)out_buf, in_h >> 1, in_w >> 2);
  } else {
      dim3 block(256, 1, 1);
      // One block per pair of image rows.
      dim3 grid(in_h / 2, 1, 1);
      // Dynamic shared memory for two luma rows, the chroma row(s), one RGB
      // output row and alignment padding.
      // NOTE(review): the padding budget depends on global2share_copy /
      // share2global_copy, which are not visible here - confirm it is enough.
      int sm_size = in_w * 6 + 16 * 3;
      Yuv2rgb24_general<true, false, uint8_t, false><<<grid, block, sm_size, stream>>>(in_buf, out_buf, in_h, in_w);
  }
}


/// Converts a YUV 4:2:0 frame whose width is a multiple of 4 into interleaved
/// 3-channel output, one 4 x 2 pixel tile per thread.
///
/// Each thread loads 4 luma bytes from two consecutive rows plus the two
/// chroma pairs that cover them, converts the 8 pixels with cvt2rgb (defined
/// elsewhere; its .x/.y/.z members are stored as channels 0/1/2), and stages
/// the packed 3-byte pixels in shared memory. The staged bytes of one
/// threadIdx.y row are then written out cooperatively by the threads of that
/// row, so that consecutive threads store consecutive words.
///
/// Launch requirements derived from the indexing below:
///  - blockDim.x * blockDim.y must be at most 128 (blockDim.z == 1): the
///    staging buffer holds 128 threads x 24 bytes and the second-row area
///    starts at the fixed byte offset 1536.
///  - `in` must be 4-byte aligned; `out` must be aligned for uint32_t stores
///    (or float2 stores when float_out is true).
///
/// @tparam IsYU12    true: planar chroma (U plane followed by V plane);
///                   false: interleaved chroma, first byte of each pair is
///                   used as U and the second as V.
/// @tparam float_out true: `out` is written as float, one float per channel;
///                   false: `out` is written as packed bytes.
/// @param in     Input frame, addressed as 32-bit words.
/// @param out    Output buffer (see float_out).
/// @param height Number of row pairs (image height / 2).
/// @param width  Number of 4-pixel groups per row (image width / 4).
template<bool IsYU12, bool float_out = false>
__global__ void Yuv2rgb24(uint32_t *__restrict__ in, void *__restrict__ out, int32_t height, int width) {
  uint32_t *u8_out = nullptr;
  float2 *fp32_out = nullptr;
  if (float_out) {
    fp32_out = (float2*)out;
  } else {
    u8_out = (uint32_t*)out;
  }
  uint32_t w_id = blockDim.x * blockIdx.x + threadIdx.x;
  uint32_t h_id = blockDim.y * blockIdx.y + threadIdx.y;
  uint32_t out_idx = h_id * width + w_id;
  uint32_t thread_idx = threadIdx.y * blockDim.x + threadIdx.x;
  // 128 threads * 4 pixels * 3 channels * 2 rows = 3072 bytes = 768 words.
  __shared__ uint32_t out_sm[768];

  // Threads past the right/bottom edge do no conversion work, but they must
  // still reach the barrier below, so the guard is a flag instead of one big
  // enclosing branch.
  const bool in_range = (w_id < width && h_id < height);

  using dTy = uint16_t; // or int8_t
  if (in_range) {
    dTy u1, v1, u2, v2;
    uint32_t temp_y1, temp_y2;
    uchar4 temp_rgb0[4], temp_rgb1[4];

    // Four luma samples from the upper row and four from the lower row.
    uint32_t pos = h_id * 2 * width + w_id;
    temp_y1 = in[pos];
    pos = (h_id * 2 + 1) * width + w_id;
    temp_y2 = in[pos];

    if (IsYU12) {
      // Planar chroma, addressed in 16-bit units: the luma plane spans
      // height * width * 4 such units, the U plane height * width.
      pos = height * width * 4 + out_idx;
      uint16_t u12 = ((uint16_t*)in)[pos];
      pos += height * width;
      uint16_t v12 = ((uint16_t*)in)[pos];
      u1 = ((uint8_t*)&u12)[0];
      u2 = ((uint8_t*)&u12)[1];
      v1 = ((uint8_t*)&v12)[0];
      v2 = ((uint8_t*)&v12)[1];
    } else {
      // Interleaved chroma, addressed in 32-bit units: one word carries the
      // two chroma pairs belonging to this 4-pixel group.
      pos = height * width * 2 + out_idx;
      uint32_t uv = in[pos];
      u1 = ((uint8_t*)&uv)[0];
      v1 = ((uint8_t*)&uv)[1];
      u2 = ((uint8_t*)&uv)[2];
      v2 = ((uint8_t*)&uv)[3];
    }

    dTy y1 = ((uint8_t*)&temp_y1)[0];
    dTy y2 = ((uint8_t*)&temp_y1)[1];
    dTy y3 = ((uint8_t*)&temp_y1)[2];
    dTy y4 = ((uint8_t*)&temp_y1)[3];
    dTy y5 = ((uint8_t*)&temp_y2)[0];
    dTy y6 = ((uint8_t*)&temp_y2)[1];
    dTy y7 = ((uint8_t*)&temp_y2)[2];
    dTy y8 = ((uint8_t*)&temp_y2)[3];
    // Upper-row pixels are staged in the first 1536 bytes, lower-row pixels
    // in the second 1536 bytes.
    uint8_t *out_row1 = (uint8_t*)out_sm;
    uint8_t *out_row2 = out_row1 + 1536;

    // Pixels 0-1 share (u1, v1) and pixels 2-3 share (u2, v2), in both rows.
    temp_rgb0[0] = cvt2rgb<dTy>(y1,u1,v1);
    temp_rgb0[1] = cvt2rgb<dTy>(y2,u1,v1);
    temp_rgb0[2] = cvt2rgb<dTy>(y3,u2,v2);
    temp_rgb0[3] = cvt2rgb<dTy>(y4,u2,v2);
    temp_rgb1[0] = cvt2rgb<dTy>(y5,u1,v1);
    temp_rgb1[1] = cvt2rgb<dTy>(y6,u1,v1);
    temp_rgb1[2] = cvt2rgb<dTy>(y7,u2,v2);
    temp_rgb1[3] = cvt2rgb<dTy>(y8,u2,v2);

    // Pack to 3 bytes per pixel: 12 bytes per thread and row.
    for (int i = 0; i < 4; i++) {
      out_row1[thread_idx * 12 + i*3 + 0] = temp_rgb0[i].x;
      out_row1[thread_idx * 12 + i*3 + 1] = temp_rgb0[i].y;
      out_row1[thread_idx * 12 + i*3 + 2] = temp_rgb0[i].z;
      out_row2[thread_idx * 12 + i*3 + 0] = temp_rgb1[i].x;
      out_row2[thread_idx * 12 + i*3 + 1] = temp_rgb1[i].y;
      out_row2[thread_idx * 12 + i*3 + 2] = temp_rgb1[i].z;
    }
  }

  // The copy-out below reads bytes staged by neighbouring threads, so the
  // staging writes must be complete and visible first. The barrier sits
  // outside the in_range branches because every thread of the block has to
  // execute it.
  __syncthreads();

  if (in_range) {
    thread_idx = threadIdx.x;
    // Output elements per staged thread: 3 uint32_t words, or 6 float2 values.
    int num_loops = float_out ? 6 : 3;
    int out_offset1 = h_id * 2 * width * num_loops + blockDim.x * blockIdx.x * num_loops;
    int out_offset2 = (h_id * 2 + 1) * width * num_loops + blockDim.x * blockIdx.x * num_loops;
    int sm_offset1 = threadIdx.y * blockDim.x * num_loops;
    int sm_offset2 = sm_offset1 + 128 * num_loops;
    // Number of in-range threads along x in this block (smaller in the last,
    // partially filled block column).
    int threads = blockDim.x * (blockIdx.x + 1) <= width ? blockDim.x : (width - blockDim.x * blockIdx.x);
    for (int i = 0; i < num_loops; i++) {
      if (float_out) {
        uchar2 tmp1, tmp2;
        tmp1 = *((uchar2 *)out_sm + sm_offset1 + thread_idx);
        tmp2 = *((uchar2 *)out_sm + sm_offset2 + thread_idx);
        fp32_out[out_offset1 + thread_idx].x = tmp1.x;
        fp32_out[out_offset1 + thread_idx].y = tmp1.y;

        fp32_out[out_offset2 + thread_idx].x = tmp2.x;
        fp32_out[out_offset2 + thread_idx].y = tmp2.y;
      } else {
        u8_out[out_offset1 + thread_idx] = out_sm[sm_offset1 + thread_idx];
        u8_out[out_offset2 + thread_idx] = out_sm[sm_offset2 + thread_idx];
      }
      thread_idx += threads;
    }
  }
}


/// Converts a YUV 4:2:0 frame of arbitrary width into 3-channel output, one
/// thread block per pair of image rows (blockIdx.x selects the pair).
///
/// The two luma rows and the matching chroma row(s) are first copied into
/// dynamic shared memory with global2share_copy (defined elsewhere). Each row
/// is then converted pixel pair by pixel pair into a shared staging row
/// (out_tmp) and written out from there.
///
/// Launch requirements derived from the code below:
///  - gridDim.x row pairs are processed; the block is one-dimensional.
///  - Dynamic shared memory must hold two luma rows, the chroma data, one
///    3-byte-per-pixel output row and the padding added between the regions.
///  - NOTE(review): pixels are converted in pairs (i < w / 2), so an odd `w`
///    leaves the last pixel of each row unconverted - presumably callers only
///    pass even widths; confirm.
///
/// @tparam IsYU12    true: planar chroma (U plane at w*h, V plane at
///                   w*h + w*h/4); false: interleaved chroma at w*h, first
///                   byte of each pair used as U, second as V.
/// @tparam float_out true: copy the staged row element-wise into `out`
///                   (each byte converted to OUT).
/// @tparam OUT       Element type of the output buffer.
/// @tparam plane     true (with float_out == false): write three separate
///                   planes of h*w elements; false: interleaved output.
/// @param in  Input frame in global memory.
/// @param out Output buffer in global memory.
/// @param h   Image height in pixels.
/// @param w   Image width in pixels.
template<bool IsYU12, bool float_out, typename OUT, bool plane>
__global__ void Yuv2rgb24_general(uint8_t *__restrict__ in, OUT *__restrict__ out, int32_t h, int w) {
    extern __shared__ uint8_t sm[];
    uint8_t *y = sm;
    uint8_t *out_tmp = nullptr;
    const int h_idx = blockIdx.x;

    // Stage the two luma rows of this row pair. The helper's return value is
    // applied as a shift to the shared pointer - presumably an alignment
    // offset; the helper is not visible here.
    int offset = global2share_copy(in + h_idx * w * 2, y, w * 2);
    y += offset;

    int16_t y1, y2, u1, v1;
    uint8_t *u = nullptr, *v = nullptr, *uv = nullptr;
    if (IsYU12) {
        // Regions are rounded down to a multiple of 8 and padded by 16 bytes
        // to leave room for the shift applied after each copy.
        u = y + w * 2 / 8 * 8 + 16;
        v = u + w / 2 / 8 * 8 + 16;
        out_tmp = v + w / 2 / 8 * 8 + 16;
        offset = global2share_copy(in + w * h + h_idx * w / 2, u, w >> 1);
        u += offset;
        // V plane starts after the Y plane (w*h) and the U plane (w*h/4).
        // Integer arithmetic is used on purpose: w * h * 1.25f is rounded to
        // a 24-bit float mantissa and yields a wrong offset for large frames.
        offset = global2share_copy(in + w * h + w * h / 4 + h_idx * w / 2, v, w >> 1);
        v += offset;
    } else {
        uv = y + w * 2 / 8 * 8 + 16;
        out_tmp = uv + w / 8 * 8 + 16;
        offset = global2share_copy(in + w * h + h_idx * w, uv, w);
        uv += offset;
    }

    // The conversion loops read luma/chroma bytes staged by other threads.
    // global2share_copy is not visible here; if it already synchronizes, this
    // barrier is redundant but harmless.
    __syncthreads();

    // Row 0 is the upper row of the pair, row 1 the lower one; both reuse the
    // same chroma samples and the same staging row.
    for (int row = 0; row < 2; ++row) {
        OUT *out_ptr = nullptr;
        if (plane) {
            out_ptr = out + (h_idx * 2 + row) * w;
        } else {
            out_ptr = out + (h_idx * 2 + row) * w * 3;
        }

        // Bytes up to the next 8-byte boundary of the destination row; only
        // used for the packed interleaved byte output.
        uint8_t front = 0;
        if (!float_out && !plane) front = (8 - ((uint64_t)out_ptr & 7)) & 7;

        const uint8_t *y_row = y + row * w;
        for (int i = threadIdx.x; i < w / 2; i += blockDim.x) {
            if (IsYU12) {
                u1 = u[i];
                v1 = v[i];
            } else {
                u1 = uv[i * 2 + 0];
                v1 = uv[i * 2 + 1];
            }
            y1 = y_row[i * 2 + 0];
            y2 = y_row[i * 2 + 1];
            uchar4 a = cvt2rgb<int16_t>(y1,u1,v1);
            uchar4 b = cvt2rgb<int16_t>(y2,u1,v1);
            offset = i * 6;
            // Shift the staged row so that it lines up with what
            // share2global_copy expects for a destination with `front`
            // leading bytes (helper not visible here).
            if (!float_out && !plane) offset += 8 - front;
            out_tmp[offset + 0] = a.x;
            out_tmp[offset + 1] = a.y;
            out_tmp[offset + 2] = a.z;
            out_tmp[offset + 3] = b.x;
            out_tmp[offset + 4] = b.y;
            out_tmp[offset + 5] = b.z;
        }

        // The write-out below reads staging bytes produced by other threads.
        __syncthreads();

        if (float_out) {
            // NOTE(review): with float_out && plane this still writes an
            // interleaved row at a planar row offset - confirm that this
            // combination is never instantiated.
            for (int i = threadIdx.x; i < w * 3; i += blockDim.x) {
                out_ptr[i] = out_tmp[i];
            }
        } else if (plane) {
            for (int i = threadIdx.x; i < w; i += blockDim.x) {
                out_ptr[i] = out_tmp[i * 3 + 0];
                out_ptr[i + h * w] = out_tmp[i * 3 + 1];
                out_ptr[i + h * w * 2] = out_tmp[i * 3 + 2];
            }
        } else {
            share2global_copy(out_tmp, (uint8_t*)out_ptr, w * 3, front);
        }

        // The staging row is overwritten by the next row; wait until every
        // thread has finished reading it.
        __syncthreads();
    }
}




