TensorRT C++ YOLOv8 video detection with GPU post-processing

Generate the engine from the original YOLOv8 model and run inference with the TensorRT framework. The input and output dimensions stay unchanged, and all post-processing is done on the GPU; compared with the CPU version, the post-processing time drops to roughly one tenth.

Environment

Tested on Windows

GPU: RTX 3060 Ti

TensorRT: 8.5.3.1

CUDA: 11.7

OpenCV: 4.8.0

There are plenty of posts covering environment setup, so installation is not described here.

Preparing the weights

For how to generate the engine (with or without quantization), see my other article: onnx转engine工具(包含量化) python脚本_lprnet python onnx转engine-CSDN博客

Overall approach and core code

The inference code is based on GitHub - wang-xinyu/tensorrtx: Implementation of popular deep learning networks with TensorRT network definition API

The author's code adds a plugin when generating the engine, which effectively performs part of the post-processing inside the network. The input dimension 3*640*640 matches the original model, but the output dimension is 6001*1, unlike the original YOLOv8 output of 84*8400. That makes it awkward to use the original weights directly: a dedicated engine has to be generated first, and for a custom network the serialization becomes fairly involved. This article modifies that approach so that the original model output is used directly as the input to post-processing to obtain the box information, and every post-processing step runs in CUDA. The idea is simple:

The model output 84*8400 is first transposed, then filtered by the confidence threshold and then by the IoU threshold. The post-processed result is a single vector: the first element holds the number of boxes; elements 2 through 8 hold the first box's left, top, right, bottom, conf, id, flag; the second box follows, and so on. The flag marks whether the box should be drawn on the original image (see the small host-side decode sketch below).
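To make that layout concrete, here is a minimal host-side decode sketch, assuming the result vector has already been copied back to the CPU; the struct, function, and buffer names are illustrative and not part of the original code:

#include <algorithm>
#include <vector>

struct Detection { float left, top, right, bottom, conf; int cls; };

// host_output holds 1 + max_objects * NUM_BOX_ELEMENT floats copied back from the GPU
std::vector<Detection> decode(const float* host_output, int max_objects, int NUM_BOX_ELEMENT) {
    std::vector<Detection> dets;
    int count = std::min((int)host_output[0], max_objects);     // element 0: number of boxes written
    for (int i = 0; i < count; ++i) {
        const float* p = host_output + 1 + i * NUM_BOX_ELEMENT; // left, top, right, bottom, conf, id, flag
        if (p[6] == 0) continue;                                 // flag == 0: suppressed by NMS
        dets.push_back({p[0], p[1], p[2], p[3], p[4], (int)p[5]});
    }
    return dets;
}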

All of the post-processing is implemented on the GPU, postprocess.cu:




__global__ void filter_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold,
    float* parray, int max_objects, int NUM_BOX_ELEMENT
)
{
    int position = threadIdx.x + blockIdx.x * blockDim.x; // absolute thread index
    if (position >= num_bboxes) { return; }               // surplus threads exit immediately

    float* pitem = predict + position * (4 + num_classes); // pointer to this candidate's row
    float* class_confidence = pitem + 4;                   // pointer to the first class score
    float confidence = *class_confidence++;                // read the first class score, advance the pointer
    int label = 0;                                         // index of the best class
    for (int i = 1; i < num_classes; ++i, ++class_confidence) // scan all class scores for the maximum and its index
    {
        if (*class_confidence > confidence) {
            confidence = *class_confidence;
            label = i;
        }
    }
    if (confidence < confidence_threshold) { return; }

    float cx = *pitem++;
    float cy = *pitem++;
    float w = *pitem++;
    float h = *pitem++;
    float left = cx - 0.5f * w;
    float top = cy - 0.5f * h;
    float right = cx + 0.5f * w;
    float bottom = cy + 0.5f * h;

    int index = atomicAdd(parray, 1); // parray[0] holds the running box count
    if (index >= max_objects) { return; }

    // left, top, right, bottom, confidence, class, keepflag
    float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
    *pout_item++ = left;
    *pout_item++ = top;
    *pout_item++ = right;
    *pout_item++ = bottom;
    *pout_item++ = confidence;
    *pout_item++ = label;
    *pout_item++ = 1; // 1 = keep, 0 = ignore
}

static __device__ float box_iou(
    float aleft, float atop, float aright, float abottom,
    float bleft, float btop, float bright, float bbottom)
{

    float cleft = max(aleft, bleft);
    float ctop = max(atop, btop);
    float cright = min(aright, bright);
    float cbottom = min(abottom, bbottom);

    float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
    if (c_area == 0.0f){ return 0.0f;};
       
    float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
    float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
    return c_area / (a_area + b_area - c_area);
}

__global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT)
{
    int position = threadIdx.x + blockIdx.x * blockDim.x;  // absolute thread index
    int count = min((int)*bboxes, max_objects);            // actual number of boxes: min(saved count, max_objects)
    if (position >= count) { return; }                     // surplus threads exit immediately
    float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT; // pointer to the current box

    // left, top, right, bottom, confidence, class, keepflag
    // Idea: each box is compared against every other box of the same class;
    // if it overlaps a higher-scoring box with IoU above the threshold, the current box is marked 0.
    for (int i = 0; i < count; ++i)
    {
        float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;   // pointer to the box being compared against
        if (i == position || pcurrent[5] != pitem[5]) { continue; } // skip itself and boxes of other classes

        if (pitem[4] >= pcurrent[4])
        {
            if (pitem[4] == pcurrent[4] && i < position) { continue; } // equal confidence: break the tie by index so only one of the pair is suppressed

            float iou = box_iou(
                pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
                pitem[0],    pitem[1],    pitem[2],    pitem[3]
            );

            if (iou > threshold) // a higher-scoring box overlaps too much, so drop the current box
            {
                pcurrent[6] = 0;  // 1 = keep, 0 = ignore
                return;
            }
        }
    }
}



__global__ void transposeKernel(float* input, float* output, int width, int height) {
    // linear thread index
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // stay within the matrix bounds
    if (idx < width * height) {
        int row = idx / width;
        int col = idx % width;

        // indices after transposition
        int new_row = col;
        int new_col = row;

        // write to the transposed linear position
        output[new_row * height + new_col] = input[row * width + col];
    }
}


// predict: pointer to the raw inference output; trans_row: number of rows after transposition; class_num: number of classes;
// confidence_threshold: confidence threshold; nms_threshold: IoU threshold;
// parray: pointer to the final post-processed output; max_objects: maximum number of boxes; NUM_BOX_ELEMENT: elements per box; stream: CUDA stream;
// ori_width: width before transposition; ori_height: height before transposition; trans_predict: pointer to the transposed output
void postprocess(
    float* predict,  int trans_row, int class_num, float confidence_threshold,
    float nms_threshold, float* parray, int max_objects, int NUM_BOX_ELEMENT,
    cudaStream_t stream, int ori_width, int ori_height, float* trans_predict){

    int numThreads = 1024;
    int numBlocks = (ori_width * ori_height + numThreads - 1) / numThreads;

    // transpose the raw output (84*8400 -> 8400*84) so each row holds one candidate box
    transposeKernel<<<numBlocks, numThreads, 0, stream>>>(predict, trans_predict, ori_width, ori_height);

    // confidence filtering: one thread per candidate box
    auto block = trans_row > 512 ? 512 : trans_row;
    auto grid = (trans_row + block - 1) / block;
    filter_kernel<<<grid, block, 0, stream>>>(
       trans_predict, trans_row, class_num, confidence_threshold, parray, max_objects, NUM_BOX_ELEMENT
    );

    // NMS: one thread per surviving box
    block = max_objects > 512 ? 512 : max_objects;
    grid = (max_objects + block - 1) / block;
    fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
}
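A rough host-side sketch of how postprocess() might be driven per frame follows. The dimensions (84*8400, 80 classes), thresholds, buffer names, and the counter reset are assumptions for a standard 640x640 YOLOv8 model rather than code from the original project; error checking is omitted:

#include <cuda_runtime.h>
#include <vector>

// forward declaration of the function defined in postprocess.cu
void postprocess(float* predict, int trans_row, int class_num, float confidence_threshold,
                 float nms_threshold, float* parray, int max_objects, int NUM_BOX_ELEMENT,
                 cudaStream_t stream, int ori_width, int ori_height, float* trans_predict);

int main() {
    // assumed dimensions for a standard 640x640 YOLOv8 model: 84 x 8400 output, 80 classes
    const int ORI_H = 84, ORI_W = 8400, NUM_CLASSES = 80;
    const int MAX_OBJECTS = 1000, NUM_BOX_ELEMENT = 7;

    float *d_predict, *d_trans, *d_parray;
    cudaMalloc(&d_predict, ORI_H * ORI_W * sizeof(float));                       // raw engine output binding
    cudaMalloc(&d_trans,   ORI_H * ORI_W * sizeof(float));                       // transposed output
    cudaMalloc(&d_parray, (1 + MAX_OBJECTS * NUM_BOX_ELEMENT) * sizeof(float));  // compact result vector

    std::vector<float> h_parray(1 + MAX_OBJECTS * NUM_BOX_ELEMENT);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // ... enqueue TensorRT inference here so that d_predict holds the 84*8400 output ...

    cudaMemsetAsync(d_parray, 0, sizeof(float), stream);  // clear the box counter before each frame (assumption)

    postprocess(d_predict, /*trans_row=*/ORI_W, NUM_CLASSES, 0.25f, 0.45f,
                d_parray, MAX_OBJECTS, NUM_BOX_ELEMENT,
                stream, /*ori_width=*/ORI_W, /*ori_height=*/ORI_H, d_trans);

    // copy only the small result vector back to the host and wait for the stream
    cudaMemcpyAsync(h_parray.data(), d_parray, h_parray.size() * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);
    cudaStreamSynchronize(stream);

    cudaFree(d_predict); cudaFree(d_trans); cudaFree(d_parray);
    cudaStreamDestroy(stream);
    return 0;
}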

Results

Here the CPU and GPU paths are compared; the measured time includes pre-processing, inference, post-processing, and drawing, on an RTX 3060 Ti.

The whole pipeline runs on TensorRT: pre-processing and inference are done on the GPU and drawing on the CPU, so the comparison isolates the cost difference between CPU and GPU post-processing. In the post-processing stage, the serial loops on the CPU dominate the runtime.
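For reference, a minimal sketch of how one stage can be timed fairly, reusing the buffers and the postprocess() declaration from the sketch above; the stream has to be synchronized before and after, otherwise asynchronous kernel launches make the GPU path look faster than it is:

#include <chrono>
#include <cuda_runtime.h>

// Returns the wall-clock time of one postprocess() call in milliseconds.
// postprocess() and the device buffers are the ones from the previous sketch.
double time_postprocess_ms(float* d_predict, float* d_trans, float* d_parray, cudaStream_t stream) {
    const int ORI_H = 84, ORI_W = 8400, NUM_CLASSES = 80;   // assumed 640x640 YOLOv8 output
    const int MAX_OBJECTS = 1000, NUM_BOX_ELEMENT = 7;

    cudaStreamSynchronize(stream);                           // make sure earlier work has finished
    auto t0 = std::chrono::high_resolution_clock::now();

    postprocess(d_predict, ORI_W, NUM_CLASSES, 0.25f, 0.45f,
                d_parray, MAX_OBJECTS, NUM_BOX_ELEMENT,
                stream, ORI_W, ORI_H, d_trans);
    cudaStreamSynchronize(stream);                           // wait for the kernels, not just the launches

    auto t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}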
