Table of Contents
1. Common image preprocessing implementations
2. Accelerating image preprocessing with OpenCV CUDA
References
1. Common image preprocessing implementations
Unlike the training stage, which applies all kinds of data augmentation, the preprocessing used at inference time is typically the fixed pipeline below:
from PIL import Image
from torchvision import transforms as T

# 1. read image (PIL loads RGB pixel data in HWC layout)
img = Image.open("image-path.jpg").convert('RGB')

# 2. image preprocessing
# resize_size can be an int or a tuple, e.g. 448 or (448, 448)
resize_size = 448
transform = T.Compose([T.Resize(resize_size, interpolation=T.InterpolationMode.BILINEAR),
                       T.ToTensor(),
                       T.Normalize(mean=[0.485, 0.456, 0.406],
                                   std=[0.229, 0.224, 0.225])])
img = transform(img)
Breaking the preprocessing pipeline down step by step:
Image.open("image-path.jpg").convert("RGB")
Reading the image with PIL yields pixel data in HWC layout with the channels in RGB order (unlike OpenCV, which loads BGR).
Resize(resize_size, interpolation=T.InterpolationMode.BILINEAR)
This scales the input image via interpolation so it matches the network's input size; bilinear interpolation is the usual choice (and torchvision's default).
ToTensor()
This converts a PIL Image or numpy array into a PyTorch Tensor, rearranging the layout from HWC to CHW and scaling pixel values from [0, 255] to [0, 1] by computing "pixel value / 255".
Normalize()
This standardizes the input channel by channel. The formula is:
output[c] = (input[c] - mean[c]) / std[c]
The sketch below shows that ToTensor plus Normalize is equivalent to doing these operations by hand.
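As a sanity check, here is a minimal sketch that reproduces ToTensor and Normalize manually with numpy and compares the result against torchvision (a synthetic random image stands in for a real file so the snippet runs standalone):

import numpy as np
import torch
from PIL import Image
from torchvision import transforms as T

# synthetic 448x448 RGB image, standing in for a real file
img = Image.fromarray(np.random.randint(0, 256, (448, 448, 3), dtype=np.uint8))

mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
out_transform = T.Compose([T.ToTensor(),
                           T.Normalize(mean=mean, std=std)])(img)

# manual equivalent: uint8 HWC -> float CHW in [0, 1] -> (x - mean) / std
arr = np.asarray(img).astype(np.float32) / 255.0   # ToTensor: scale to [0, 1]
arr = arr.transpose(2, 0, 1)                       # ToTensor: HWC -> CHW
arr = (arr - np.reshape(mean, (3, 1, 1))) / np.reshape(std, (3, 1, 1))
out_manual = torch.from_numpy(arr).float()

print(torch.allclose(out_transform, out_manual, atol=1e-6))  # True

These are exactly the operations the C++ code in the next section reimplements on the GPU.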
2. Accelerating image preprocessing with OpenCV CUDA
// -------------- opencv ----------------------- #
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
// ---------------- opencv-cuda ---------------- #
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <opencv2/cudaimgproc.hpp>
// ------------ cuda ------------------------- #
#include <cuda_runtime_api.h>
// ------------------- nvinfer1 ------------------ #
#include "NvInfer.h"
// ------------ standard libraries --------------- #
#include <iostream>
#include <assert.h>
#include <string>
#include <vector>
// ---------------------------------------------- #
void preprocessImage(const std::string& image_path, float* gpu_input,
                     nvinfer1::Dims3& dims)
{
    // read image
    cv::Mat frame = cv::imread(image_path);
    if (frame.empty())
    {
        std::cerr << "failed to load image: " << image_path << "!" << std::endl;
        return;
    }
    // upload to GPU
    cv::cuda::GpuMat gpu_frame;
    gpu_frame.upload(frame);
    // resize to the network input size (dims are in CHW order)
    auto input_width = dims.d[2];
    auto input_height = dims.d[1];
    auto channels = dims.d[0];
    auto input_size = cv::Size(input_width, input_height);
    cv::cuda::GpuMat resized;
    cv::cuda::resize(gpu_frame, resized, input_size, 0, 0, cv::INTER_LINEAR);
    // BGR -> RGB (OpenCV loads images in BGR order, the network expects RGB;
    // this must happen before normalization so the RGB means/stds line up)
    cv::cuda::GpuMat rgb;
    cv::cuda::cvtColor(resized, rgb, cv::COLOR_BGR2RGB);
    /* ---------------- PyTorch ToTensor and Normalize ---------------- */
    cv::cuda::GpuMat flt_image;
    rgb.convertTo(flt_image, CV_32FC3, 1.f / 255.f);
    cv::cuda::subtract(flt_image, cv::Scalar(0.485f, 0.456f, 0.406f), flt_image,
                       cv::noArray(), -1);
    cv::cuda::divide(flt_image, cv::Scalar(0.229f, 0.224f, 0.225f), flt_image, 1, -1);
    /* ----------------------------------------------------------------- */
    // ToTensor (copy the data to the input float pointer channel by channel):
    // each GpuMat header wraps one channel-sized slice of gpu_input, so
    // split() writes the HWC image directly into CHW layout.
    std::vector<cv::cuda::GpuMat> rgb_out;
    for (int i = 0; i < channels; ++i)
    {
        rgb_out.emplace_back(cv::Size(input_width, input_height), CV_32FC1,
                             gpu_input + i * input_width * input_height);
    }
    cv::cuda::split(flt_image, rgb_out); // OpenCV HWC order -> CHW order
}
// calculate the number of elements in a tensor
size_t getSizeByDim(const nvinfer1::Dims& dims)
{
    size_t size = 1;
    for (int i = 0; i < dims.nbDims; ++i)
    {
        size *= dims.d[i];
    }
    return size;
}
int main()
{
    std::string image_path = "../00.jpg";
    // CHW order
    nvinfer1::Dims3 input_dim(3, 448, 448);
    auto input_size = getSizeByDim(input_dim) * sizeof(float);
    // allocate GPU memory for the network input;
    // this buffer plays the role of the input device memory a TensorRT
    // engine would read from during inference
    std::vector<void*> buffers(1);
    cudaMalloc(&buffers[0], input_size);
    // preprocess
    preprocessImage(image_path, (float*)buffers[0], input_dim);
    // wrap the CHW planes in the buffer and merge them back into an HWC image
    cv::cuda::GpuMat gpu_output;
    std::vector<cv::cuda::GpuMat> planes;
    for (int i = 0; i < 3; ++i)
    {
        planes.emplace_back(cv::Size(input_dim.d[2], input_dim.d[1]), CV_32FC1,
                            (float*)buffers[0] + i * input_dim.d[2] * input_dim.d[1]);
    }
    cv::cuda::merge(planes, gpu_output);
    cv::cuda::GpuMat image_out;
    // scale back to [0, 255] and convert to 8-bit so imwrite can save it
    gpu_output.convertTo(image_out, CV_8UC3, 255.f);
    // download
    cv::Mat dst;
    image_out.download(dst);
    // the buffer holds RGB planes, but imwrite expects BGR
    cv::cvtColor(dst, dst, cv::COLOR_RGB2BGR);
    cv::imwrite("../01_test_demo.jpg", dst);
    for (void* buf : buffers)
    {
        cudaFree(buf);
    }
    return 0;
}
[Note] When testing the code above, you can comment out the subtract and divide operations and check whether the output image matches a plain resize of the input.
Also be aware that OpenCV's CPU and GPU resize implementations differ slightly; this was noticed while debugging the program, and can be verified with the sketch below.
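A minimal sketch of that check, in Python for brevity (it assumes an OpenCV build with the CUDA modules enabled; "00.jpg" is the test image from the code above):

import cv2

frame = cv2.imread("00.jpg")

# resize on the CPU
cpu_resized = cv2.resize(frame, (448, 448), interpolation=cv2.INTER_LINEAR)

# resize on the GPU
gpu_frame = cv2.cuda_GpuMat()
gpu_frame.upload(frame)
gpu_resized = cv2.cuda.resize(gpu_frame, (448, 448),
                              interpolation=cv2.INTER_LINEAR).download()

# the per-pixel difference is typically small but non-zero
print(cv2.absdiff(cpu_resized, gpu_resized).max())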
References
Is there a way to feed OpenCV cv::cuda::GpuMat data to TensorRT for GPU-accelerated inference? - Zhihu (zhihu.com)
How To Run Inference Using TensorRT C++ API | LearnOpenCV
learnopencv/PyTorch-ONNX-TensorRT-CPP at master · spmallick/learnopencv (github.com)