C实现:
/*
 * Convert a packed 24-bit image to NV12 (a full-resolution Y plane followed by
 * one interleaved, 2x2-subsampled U/V plane).
 *
 * data : source pixels, 3 bytes per pixel read in the order B, G, R. Each
 *        source row occupies (w * 3) bytes rounded up to a multiple of 4.
 * w, h : image size in pixels. An odd h is rounded down to the nearest even
 *        value, so the last row of an odd-height image is not converted.
 * yuv  : destination buffer. Luma is written from yuv[0]; chroma starts at
 *        yuv + w * h (using the h that was passed in, before rounding).
 *        For an odd w each chroma row takes w + 1 bytes.
 *
 * Chroma is taken from the top-left pixel of every 2x2 block and stored as
 * U then V, which is the byte order NV12 requires.
 * Nothing is written when a pointer is null or a dimension is not positive.
 */
void rgb24_to_nv12(uint8_t* data, int w, int h, uint8_t* yuv)
{
    const uint8_t* src_row;
    const uint8_t* px;
    uint8_t* luma;
    uint8_t* chroma;
    uint32_t row_bytes;
    int r, g, b;
    int i, j;

    if (!data || !yuv || w <= 0 || h <= 0)
        return;

    luma = yuv;
    chroma = yuv + w * h;
    /* Source rows are padded to a 4-byte boundary. */
    row_bytes = (w * 3 + 3) & ~3;
    /* Only complete pairs of rows are converted. */
    h = h & ~1;

    /* Pass 1: one luma sample per pixel. */
    src_row = data;
    for (i = 0; i < h; i++)
    {
        px = src_row;
        for (j = 0; j < w; j++)
        {
            b = px[0];
            g = px[1];
            r = px[2];
            px += 3;
            *luma++ = (uint8_t)((30 * r + 59 * g + 11 * b) / 100);
        }
        /* Advancing the row pointer avoids a 32-bit byte-offset accumulator. */
        src_row += row_bytes;
    }

    /* Pass 2: one U/V pair per 2x2 block, sampled at the block's top-left pixel. */
    src_row = data;
    for (i = 0; i < h; i += 2)
    {
        px = src_row;
        for (j = 0; j < w; j += 2)
        {
            b = px[0];
            g = px[1];
            r = px[2];
            /* Skip the sampled pixel and its right-hand neighbour. */
            px += 6;
            /* 12800 is the +128 chroma bias scaled by the common factor 100. */
            *chroma++ = (uint8_t)((-17 * r - 33 * g + 50 * b + 12800) / 100); /* U */
            *chroma++ = (uint8_t)((50 * r - 42 * g - 8 * b + 12800) / 100);   /* V */
        }
        src_row += 2 * row_bytes;
    }
}
CUDA实现:
#include <cstdint>
#include <cstdio>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Alignment mask for plane pitches: (n + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE
// rounds n up to the next multiple of 512.
#define MEMORY_ALGN_DEVICE 511
// Integer ceiling division: the number of chunks of size `grain` needed to
// cover `total` items. Callable from both host and device code.
__host__ __device__ __forceinline__ int divUp(int total, int grain)
{
    const int padded = total + grain - 1;
    return padded / grain;
}
// Computes the luma of one pixel with integer weights expressed in percent
// (30 for red, 59 for green, 11 for blue) and stores it in `y`.
__device__ __forceinline__ void rgb_to_y(const uint8_t b, const uint8_t g, const uint8_t r, uint8_t& y)
{
    const int red_part = 30 * r;
    const int green_part = 59 * g;
    const int blue_part = 11 * b;
    const int luma = (red_part + green_part + blue_part) / 100;
    y = static_cast<uint8_t>(luma);
}
// Computes luma and both chroma samples of one pixel using integer
// arithmetic; results are stored in `y`, `u` and `v`.
__device__ __forceinline__ void rgb_to_yuv(const uint8_t b, const uint8_t g, const uint8_t r, uint8_t& y, uint8_t& u, uint8_t& v)
{
    // Luma comes from the shared helper.
    rgb_to_y(b, g, r, y);

    // The +128 chroma offset, pre-multiplied by the common scale factor 100.
    const int bias = 12800;
    const int u_scaled = -17 * r - 33 * g + 50 * b + bias;
    const int v_scaled = 50 * r - 42 * g - 8 * b + bias;

    u = static_cast<uint8_t>(u_scaled / 100);
    v = static_cast<uint8_t>(v_scaled / 100);
}
// Converts a packed 24-bit image (bytes read in the order B, G, R) into three
// separate planes: Y at full resolution, U and V subsampled 2x2.
//
// Launch layout: 2D grid of 2D blocks; each thread handles one 2x2 pixel block
// whose top-left corner is (2 * global thread x, 2 * global thread y).
// Threads whose 2x2 block is not completely inside the w x h image do nothing,
// so the last column / row of an odd-sized image is left unwritten.
//
// data      : source pixels, rows are row_bytes apart.
// y_data    : luma plane, rows are y_pitch bytes apart.
// u_data,
// v_data    : chroma planes, rows are uv_pitch bytes apart.
//
// Three pixels of the block contribute luma only; the bottom-right pixel
// (x + 1, y + 1) also provides the block's U and V samples.
// No shared memory is used.
__global__ void RGB24_to_YV12_kernel(uint8_t* data,int w, int h, uint32_t row_bytes, uint8_t* y_data, uint8_t* u_data, uint8_t* v_data, uint32_t y_pitch, uint32_t uv_pitch)
{
    const int x = (blockIdx.x * blockDim.x + threadIdx.x) * 2;
    const int y = (blockIdx.y * blockDim.y + threadIdx.y) * 2;
    if (x + 1 >= w || y + 1 >= h)
        return;

    // Byte offsets are computed in size_t so that row * pitch cannot wrap
    // around 32 bits on very large frames.
    const size_t col = static_cast<size_t>(x);
    const size_t row = static_cast<size_t>(y);

    const uint8_t* src_top = data + row * row_bytes + col * 3;
    const uint8_t* src_bottom = src_top + row_bytes;
    uint8_t* luma_top = y_data + row * y_pitch + col;
    uint8_t* luma_bottom = luma_top + y_pitch;
    // x and y are even, so halving them is exact.
    const size_t chroma_offset = (row / 2) * uv_pitch + col / 2;

    uint8_t y_val, u_val, v_val;

    // pixel at (x, y)
    rgb_to_y(src_top[0], src_top[1], src_top[2], y_val);
    luma_top[0] = y_val;

    // pixel at (x+1, y)
    rgb_to_y(src_top[3], src_top[4], src_top[5], y_val);
    luma_top[1] = y_val;

    // pixel at (x, y+1)
    rgb_to_y(src_bottom[0], src_bottom[1], src_bottom[2], y_val);
    luma_bottom[0] = y_val;

    // pixel at (x+1, y+1): luma plus the chroma sample for the whole block
    rgb_to_yuv(src_bottom[3], src_bottom[4], src_bottom[5], y_val, u_val, v_val);
    luma_bottom[1] = y_val;
    u_data[chroma_offset] = u_val;
    v_data[chroma_offset] = v_val;
}
void RGB24_to_YV12(uint8_t* data, uint8_t* y, uint8_t* u, uint8_t* v, int w, int h)
{
uint32_t row_bytes;
uint32_t y_pitch;
uint32_t uv_pitch;
const dim3 block(32, 8);
const dim3 grid(divUp(w, block.x * 2), divUp(h, block.y * 2));
row_bytes = (w * 3 + 3) & ~3;
y_pitch = (w + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
uv_pitch = (w / 2 + MEMORY_ALGN_DEVICE) & ~MEMORY_ALGN_DEVICE;
RGB24_to_YV12_kernel << <grid, block >> >(data, w, h, row_bytes, y, u, v, y_pitch, uv_pitch);
cudaGetLastError();
cudaDeviceSynchronize();
}
CUDA实现参考opencv上的实现,转换的yuv格式是YV12.