项目中的模型一直都是直接操作NV12的yuv格式数据,这次的模型只支持RGB格式的输入,正好来自己实现对应的算子。
这里记录一下对应算子的实现过程,主要涉及到NV12到RGB的变换,RGB的crop/resize操作,对于数据的Norm/ToFloat操作,调整Layout等等。
cu文件是要nvcc来进行编译的,但是其头文件可以供外部的cpp文件调用,另外这里的核函数并没有涉及到stream的考虑,因为这个涉及到之后的性能优化环节,要有先来后到。实际stream也就是在核函数调用前的<<<>>>中传入stream而已,然后之后要跟着同步stream的操作。与函数实现逻辑无关。
cuda_transformation.cu
在这里实现真正的核函数,
NV12toRGB
这里的坑点在于 BT.601/709 FULL/非FULL的yuv格式,如果出了差错会导致图像看起来色度不对,遇到过的问题就是红色很不明显,原因就是转换公式写的有问题。
__global__ void NV12toRGB(uint8_t *yuv, uint8_t *rgb, int width,
int height) {
const int nv_start = width * height;
int i, j, nv_index = 0;
uint8_t y, u, v;
int r, g, b;
j = blockIdx.x * blockDim.x + threadIdx.x;
i = blockIdx.y * blockDim.y + threadIdx.y;
if (i >= height || j >= width)
return;
nv_index = i / 2 * width + j - j % 2;
int rgb_index = i * width + j;
y = yuv[rgb_index];
u = yuv[nv_start + nv_index];
v = yuv[nv_start + nv_index + 1];
r = y + (140 * (v - 128)) / 100; // r
g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100; // g
b = y + (177 * (u - 128)) / 100; // b
if (r > 255)
r = 255;
if (g > 255)
g = 255;
if (b > 255)
b = 255;
if (r < 0)
r = 0;
if (g < 0)
g = 0;
if (b < 0)
b = 0;
rgb[rgb_index * 3 + 0] = b;
rgb[rgb_index * 3 + 1] = g;
rgb[rgb_index * 3 + 2] = r;
}
int cudaNV12toRGB(uint8_t *input, uint8_t *output, size_t width,
size_t height) {
if (!input || !output)
return cudaErrorInvalidDevicePointer;
const dim3 blockDim(32, 32, 1);
const dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
(height + blockDim.y - 1) / blockDim.y, 1);
NV12toRGB<<<gridDim, blockDim>>>(input, output, width, height);
return cudaDeviceSynchronize();
}
RGBBilinearResize
__global__ void RGBBilinearResize(uint8_t *input, uint8_t *output,
int inputWidth, int inputHeight,
int outputWidth, int outputHeight) {
// 计算线程的全局索引
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= outputWidth || y >= outputHeight)
return;
// gx,gy是相对于resize后的图中的点,这里计算对应的原图中的浮点位置,确定要从哪里采样
float gx = ((float)x) / outputWidth * (inputWidth - 1);
float gy = ((float)y) / outputHeight * (inputHeight - 1);
// 对应的整数位置及其偏移量
int gxi = (int)gx;
int gyi = (int)gy;
float dx = gx - gxi;
float dy = gy - gyi;
// 读取四个最近的像素值
uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0],
input[(gyi * inputWidth + gxi) * 3 + 1],
input[(gyi * inputWidth + gxi) * 3 + 2]};
uint8_t topRight[3] = {input[(gyi * inputWidth + gxi + 1) * 3 + 0],
input[(gyi * inputWidth + gxi + 1) * 3 + 1],
input[(gyi * inputWidth + gxi + 1) * 3 + 2]};
uint8_t bottomLeft[3] = {input[((gyi + 1) * inputWidth + gxi) * 3 + 0],
input[((gyi + 1) * inputWidth + gxi) * 3 + 1],
input[((gyi + 1) * inputWidth + gxi) * 3 + 2]};
uint8_t bottomRight[3] = {
input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0],
input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1],
input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]};
// 对每个通道进行双线性插值
for (int i = 0; i < 3; i++) {
float top = topLeft[i] * (1 - dx) + topRight[i] * dx;
float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx;
output[(y * outputWidth + x) * 3 + i] = top * (1 - dy) + bottom * dy;
}
}
int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width,
size_t height, size_t resize_width,
size_t resize_height) {
if (!input || !output)
return cudaErrorInvalidDevicePointer;
const dim3 blockDim(32, 32, 1);
const dim3 gridDim((width + blockDim.x - 1) / blockDim.x,
(height + blockDim.y - 1) / blockDim.y, 1);
RGBBilinearResize<<<gridDim, blockDim>>>(input, output, width, height,
resize_width, resize_height);
return cudaDeviceSynchronize();
}
RGBToFloat
这里的实现要额外记录下,因为涉及到debug中的opencv-dump所以在传入模型前的数据都是BGR格式的,在转浮点这里重新调整成模型需要的RGB格式。
__global__ void RGBToFloat(uint8_t *input, float *output, int width, int height) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height) return;
int idx = y * width + x;
output[idx * 3 + 0] = input[idx * 3 + 2] / 255.0f; // R
output[idx * 3 + 1] = input[idx * 3 + 1] / 255.0f; // G
output[idx * 3 + 2] = input[idx * 3 + 0] / 255.0f; // B
}
int cudaRGBToFloat(uint8_t *input, float *output, int width, int height) {
dim3 blockDim(16, 16);
dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
RGBToFloat<<<gridDim, blockDim>>>(input, output, width, height);
return cudaDeviceSynchronize();
}
RGBNormalize
__global__ void RGBNormalize(float *image, int width, int height, float mean[], float std[]) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height) {
return;
}
int idx = y * width + x;
if (std[0] < 1e-6 || std[1] < 1e-6 || std[2] < 1e-6) {
printf("Error: std values are too small for safe division.\n");
return;
}
image[idx * 3 + 0] = (image[idx * 3 + 0] - mean[0]) / std[0]; // B
image[idx * 3 + 1] = (image[idx * 3 + 1] - mean[1]) / std[1]; // G
image[idx * 3 + 2] = (image[idx * 3 + 2] - mean[2]) / std[2]; // R
}
int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]) {
dim3 blockDim(16, 16);
dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
RGBNormalize<<<gridDim, blockDim>>>(d_image, width, height, mean, std);
cudaError_t cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(cudaStatus));
return -1;
}
return 0;
}
HWC2CHW
template <typename T>
__global__ void HWC2CHW(const T* input, T* output, int height, int width) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height) return;
int channelSize = width * height;
int hwcIndex = y * width + x;
int chwIndex;
for (int c = 0; c < 3; ++c) {
chwIndex = c * channelSize + y * width + x;
output[chwIndex] = input[hwcIndex * 3 + c];
}
}
template <typename T>
int cudaHWC2CHW(const T* input, T* output, int height, int width) {
dim3 blockDim(16, 16);
dim3 gridDim((width + blockDim.x - 1) / blockDim.x, (height + blockDim.y - 1) / blockDim.y);
HWC2CHW<<<gridDim, blockDim>>>(input, output, height, width);
return cudaDeviceSynchronize();
}
template int cudaHWC2CHW<float>(const float* input, float* output, int height, int width);
cuda_transformation.h
void convertNV12toYUV444withActions_cuda(uint8_t *src_img, uint8_t *src_imgcuda,
uint8_t *tmpImagecuda,
ImageTransParam &trans_param,
uint8_t *dst_imgcuda, uint8_t *dst_img,
cudaStream_t stream);
void convertNV12toYUV444withActions_cuda1(uint8_t *src_imgcuda,
ImageTransParam &trans_param,
uint8_t *dst_imgcuda);
int cudaNV12toRGB(uint8_t* input, uint8_t* output, size_t width, size_t height);
int cudaRGBBilinearResize(uint8_t *input, uint8_t *output, size_t width,
size_t height, size_t resize_width,
size_t resize_height);
int cudaRGBToFloat(uint8_t *input, float *output, int width, int height);
int cudaRGBNormalize(float *d_image, int width, int height, float mean[], float std[]);
template <typename T>
int cudaHWC2CHW(const T* input, T* output, int height, int width);
image_transformation.h
这里也是对该变换进行封装,虽然项目是面向对象的抽象出了类似Transformer这个类,但是出于逻辑清晰和方便调试,我这里提供的都是面向过程的代码,另外附上了cpu中算子的实现。实际上一个图像处理算子的实现,一般过程是先生成cpu的,基于NCHW的循环版本,再对其改装成gpu上的算子,毕竟gpu的算子调试相较cpu不是很方便。虽然有cuda-gdb这种东西。可以看到cpu和gpu的版本基本上只在循环方式上有差别,因此核函数也是可以称为 for_each_pixel_func
void TransformNV12toRGB(uint8_t *input, uint8_t *output,
int width, int height) {
int ret = cudaNV12toRGB(input, output, width, height);
if (ret != 0){
HSLOG_E << "cudaNV12toRGB FAILED";
}
}
void CpuTransformNV12toRGB(uint8_t *yuv, uint8_t *rgb,
int width, int height) {
const int nv_start = width * height;
uint32_t i, j, index = 0, rgb_index = 0;
uint8_t y, u, v;
int r, g, b, nv_index = 0;
for (i = 0; i < height; i++) {
for (j = 0; j < width; j++) {
// nv_index = (rgb_index / 2 - width / 2 * ((i + 1) / 2)) * 2;
nv_index = i / 2 * width + j - j % 2;
y = yuv[rgb_index];
u = yuv[nv_start + nv_index];
v = yuv[nv_start + nv_index + 1];
r = y + (140 * (v - 128)) / 100; // r
g = y - (34 * (u - 128)) / 100 - (71 * (v - 128)) / 100; // g
b = y + (177 * (u - 128)) / 100; // b
if (r > 255)
r = 255;
if (g > 255)
g = 255;
if (b > 255)
b = 255;
if (r < 0)
r = 0;
if (g < 0)
g = 0;
if (b < 0)
b = 0;
// index = rgb_index % width + (height - i - 1) * width;
index = rgb_index % width + i * width;
rgb[index * 3 + 0] = b;
rgb[index * 3 + 1] = g;
rgb[index * 3 + 2] = r;
rgb_index++;
}
}
}
void TransformRGBResize(uint8_t *input, uint8_t *output, size_t width,
size_t height, size_t resize_width,
size_t resize_height) {
int ret = cudaRGBBilinearResize(input, output, width, height, resize_width, resize_height);
if (ret != 0){
HSLOG_E << "cudaRGBBilinearResize FAILED: " << ret;
}
}
void CPURGBBilinearResize(uint8_t *input, uint8_t *output,
int inputWidth, int inputHeight, int outputWidth,
int outputHeight) {
for (int y = 0; y < outputHeight; y++) {
for (int x = 0; x < outputWidth; x++) {
// 计算对应的原图中的浮点位置
float gx = ((float)x) / outputWidth * (inputWidth - 1);
float gy = ((float)y) / outputHeight * (inputHeight - 1);
// 对应的整数位置及其偏移量
int gxi = (int)gx;
int gyi = (int)gy;
float dx = gx - gxi;
float dy = gy - gyi;
// 读取四个最近的像素值
uint8_t topLeft[3] = {input[(gyi * inputWidth + gxi) * 3 + 0],
input[(gyi * inputWidth + gxi) * 3 + 1],
input[(gyi * inputWidth + gxi) * 3 + 2]};
uint8_t topRight[3] = {
input[(gyi * inputWidth + gxi + 1) * 3 + 0],
input[(gyi * inputWidth + gxi + 1) * 3 + 1],
input[(gyi * inputWidth + gxi + 1) * 3 + 2]};
uint8_t bottomLeft[3] = {
input[((gyi + 1) * inputWidth + gxi) * 3 + 0],
input[((gyi + 1) * inputWidth + gxi) * 3 + 1],
input[((gyi + 1) * inputWidth + gxi) * 3 + 2]};
uint8_t bottomRight[3] = {
input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 0],
input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 1],
input[((gyi + 1) * inputWidth + gxi + 1) * 3 + 2]};
// 对每个通道进行双线性插值
for (int i = 0; i < 3; i++) {
float top = topLeft[i] * (1 - dx) + topRight[i] * dx;
float bottom = bottomLeft[i] * (1 - dx) + bottomRight[i] * dx;
output[(y * outputWidth + x) * 3 + i] =
static_cast<uint8_t>(top * (1 - dy) + bottom * dy);
}
}
}
}
void TransfromConvertRGBToFloat(uint8_t *input, float *output, int width, int height){
int ret = cudaRGBToFloat(input, output, width, height);
if (ret != 0){
HSLOG_E << "cudaRGBToFloat FAILED: " << ret;
}
}
void TransfromRGBNormalize(float *input, int width, int height, float* mean, float* std){
int ret = cudaRGBNormalize(input, width, height, mean, std);
if (ret != 0){
HSLOG_E << "cudaRGBNormalize FAILED: " << ret;
}
}
template <typename T>
int TransfromHWC2CHW(const T* input, T* output, int height, int width){
int ret = cudaHWC2CHW<T>(input, output, height, width);
if (ret != 0){
HSLOG_E << "cudaHWC2CHW FAILED: " << ret;
}
}
pre_process_module.cpp
这里额外加入一些dump的操作,以及debuggpu前N个字节的操作,方便调试
void PreProcessModule::Transform21dImage(hobot::dataflow::spMsgResourceProc proc,
const hobot::dataflow::MessageLists &msgs){
UNUSED(proc);
auto &input_img_batch_msgs = msgs[0];
std::shared_ptr<ImageBatchMsg<GPUImageMsg>> batch_image_msg =
std::static_pointer_cast<ImageBatchMsg<GPUImageMsg>>(
input_img_batch_msgs->at(0));
for (int i = 0; i < batch_image_msg->batch_size_; ++i) {
auto image_msg = batch_image_msg->batch_img_msg_[i];
int height = image_msg->img_trans_param_.src_height;
int width = image_msg->img_trans_param_.src_width;
image_transformation_[i].TransformNV12toRGB(image_msg->cuda_nv12_, image_transformation_[i].cuda_image_out_, width, height);
static int cnt = 0;
if (true)
{
std::string input_file_path= "/home/yuxuan03.zhang/utils_code/lcc/query/" + std::to_string(cnt) + ".jpg";
cv::Mat bgrImage = cv::imread(input_file_path);
if (bgrImage.empty()) {
std::cerr << "Error: Image cannot be loaded!" << std::endl;
}
size_t size = bgrImage.total() * bgrImage.elemSize(); // 计算需要复制的内存大小
HSLOG_E << "height: " << height << "width: " << width << "size: " << size << "file" << input_file_path;
// 将数据从 cv::Mat 复制到 GPU 内存
cudaMemcpy(image_transformation_[i].cuda_image_out_, bgrImage.ptr(), size, cudaMemcpyHostToDevice);
image_msg->SetDoneTimestamp(cnt);
cnt++;
}
// int size = width * height * 3 / 2;
// uint8_t* cpu_nv12 = new uint8_t[size];
// cudaMemcpy(cpu_nv12, image_msg->cuda_nv12_, size, cudaMemcpyDeviceToHost);
// cv::Mat nv12Img(height + height / 2, width, CV_8UC1, cpu_nv12);
// cv::Mat bgrImg;
// cv::cvtColor(nv12Img, bgrImg, cv::COLOR_YUV2BGR_NV12);
// std::string file = std::to_string(image_msg->GetGenTimestamp()) + "_nv12.png";
// cv::imwrite(file, bgrImg);
// delete[] cpu_nv12;
// int dataSize = width * height * 3;
// uint8_t* cpu_rgb = new uint8_t[dataSize];
// cudaMemcpy(cpu_rgb, image_transformation_[i].cuda_image_out_, dataSize, cudaMemcpyDeviceToHost);
// cv::Mat rgb_img(height, width, CV_8UC3, cpu_rgb);
// std::string file1 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb.png";
// cv::imwrite(file1, rgb_img);
// delete[] cpu_rgb;
image_transformation_[i].TransformRGBResize(image_transformation_[i].cuda_image_out_, image_transformation_[i].cuda_image_trans_buffer_, width, height, 910, 512);
HSLOG_E <<"Resize: " << PrintFirstNUint8Bytes((uint8_t*)image_transformation_[i].cuda_image_trans_buffer_);
// uint8_t* cpu_rgb_resize = new uint8_t[910*512*3];
// cudaMemcpy(cpu_rgb_resize, image_transformation_[i].cuda_image_trans_buffer_, 910*512*3, cudaMemcpyDeviceToHost);
// cv::Mat rgb_resize_img(512, 910, CV_8UC3, cpu_rgb_resize);
// std::string file2 = std::to_string(image_msg->GetGenTimestamp()) + "_rgb_resize.png";
// cv::imwrite(file2, rgb_resize_img);
// delete[] cpu_rgb_resize;
image_transformation_[i].TransfromConvertRGBToFloat(image_transformation_[i].cuda_image_trans_buffer_, (float*)image_transformation_[i].cuda_image_out_, 910, 512);
HSLOG_E <<"BRGToRGBFloat: " << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_);
std::vector<float> mean = {0.485, 0.456, 0.406};
std::vector<float> std = {0.229, 0.224, 0.225};
float* mean_gpu = (float*)image_transformation_[i].cuda_image_trans_buffer_;
float* std_gpu = mean_gpu+3;
cudaMemcpy(mean_gpu, mean.data(), 3 * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(std_gpu, std.data(), 3 * sizeof(float), cudaMemcpyHostToDevice);
image_transformation_[i].TransfromRGBNormalize((float*)image_transformation_[i].cuda_image_out_, 910, 512, mean_gpu, std_gpu);
HSLOG_E <<"Norm: " << PrintFirstNFloatBytes((float*)image_transformation_[i].cuda_image_out_);
image_transformation_[i].TransfromHWC2CHW((float*)image_transformation_[i].cuda_image_out_, (float*)image_msg->cuda_yuv444_, 512, 910);
HSLOG_E <<"HWC2CHW: " << PrintFirstNFloatBytes((float*)image_msg->cuda_yuv444_);
if (true) {
float *cuda_image_out_ = (float*)image_msg->cuda_yuv444_;
size_t dataSize = 3 * 512 * 910 * sizeof(float);
float *hostData = new float[dataSize / sizeof(float)];
cudaMemcpy(hostData, cuda_image_out_, dataSize, cudaMemcpyDeviceToHost);
std::string input_file_path= "./dump_bin/" + std::to_string(cnt) + ".bin";
std::ofstream outFile(input_file_path, std::ios::out | std::ios::binary);
outFile.write(reinterpret_cast<char *>(hostData), dataSize);
outFile.close();
delete[] hostData;
}
}
SEND_DATA(SLOT_OUT_BATCH_TRANS_IMAGE, batch_image_msg);
}