TensorRT笔记十（yolov5后处理）

最新推荐文章于 2024-09-29 23:56:18 发布

nanjono

最新推荐文章于 2024-09-29 23:56:18 发布

阅读量3.6k

点赞数 8

文章标签：笔记 YOLO

本文链接：https://blog.csdn.net/nanjono/article/details/131289568

版权

文章介绍了Yolov5模型的输出结构，重点讲述了如何进行CPU和GPU解码的后处理步骤，包括对象检测框的筛选、NMS非极大值抑制优化，以及在GPU上利用原子操作动态管理解码结果。同时，文章强调了避免多余计算和优化内存分配对提高性能的重要性。

摘要由CSDN通过智能技术生成

一、前言

yolov5的输出为tensor(25200 * 85)，
(1)其中85表示为5+80，即cx, cy, width, height, objness（框里是否含有目标）, classification * 80类别置信度
（2）25200 = （20 * 20 + 40 * 40 + 80 * 80） * 3，其中20 * 20， 40 * 40， 80 * 80分别是提取深、中、浅层的网格大小，浅层用来预测小目标，深层用来预测大目标；* 3是因为每个点有三种anchor
使用核函数对yolov5推理结果进行解码并恢复成框
研究过程，可以把pytorch的数据转换为numpy，通过tobytes写到文件，再使用C++进行读取，快速进行问题研究和排查，不需要tensorRT推理也可以进行后处理研究
对于类似GPU解码这种复杂实现，可以先实现CPU解码，再进行修改

二、后处理CPU解码

CPU解码的重点思想

避免多余计算，有些数学运算需要的时间远超过很多if语句，减少计算的次数就是性能的关键
nms实现的优化，例如使用remove_flag且预先分配内存，reserve对输出分配内存

1. 解码返回的结果为预测框

返回类型为vector<Box>，Box为预测框的结构体

// One detection: corner coordinates of the box, its confidence and its class index.
struct Box{
    // Default member initializers give a default-constructed Box well-defined
    // (all-zero) contents instead of indeterminate values.
    float left = 0.0f, top = 0.0f, right = 0.0f, bottom = 0.0f, confidence = 0.0f;
    int label = 0;

    Box() = default;

    // Builds a box from its corners (left, top, right, bottom), confidence and class label.
    Box(float left, float top, float right, float bottom, float confidence, int label):
    left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label){}
};

2. 避免多余计算，先对objness进行判断

 // Load the raw prediction dump and the input image from disk.
 auto data = load_file("predict.data");
 auto image = cv::imread("input-image.jpg");
 // View the loaded bytes as floats and split them into rows of ncols values.
 float* ptr = (float*)data.data();
 int nelem = data.size() / sizeof(float);
 int ncols = 85;
 int nrows = nelem / ncols;
 // Decode every row into boxes on the CPU.
 auto boxes = cpu_decode(ptr, nrows, ncols);
	// Decodes `rows` predictions of `cols` floats each into boxes. Each row holds
	// cx, cy, width, height, objectness, followed by (cols - 5) class scores.
	vector<Box> cpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
		vector<Box> boxes;
    int num_classes = cols - 5;
    for(int i = 0; i < rows; ++i){
        float* pitem = predict + i * cols;
        float objness = pitem[4];
        // Cheap early reject: skip rows whose objectness is already below the threshold.
        if(objness < confidence_threshold)
            continue;

3. 避免多余计算，再对置信度进行判断

        // Pick the highest class score, then combine it with objectness.
        float* pclass = pitem + 5;
        int label     = std::max_element(pclass, pclass + num_classes) - pclass;
        float prob    = pclass[label];
        float confidence = prob * objness;
        // Second filter: drop rows whose combined confidence is below the threshold.
        if(confidence < confidence_threshold)
            continue;

4. 含有目标且置信度大于阈值的存入boxes

        // Convert the center/size representation into corner coordinates.
        // Float literals keep the arithmetic in single precision.
        float cx     = pitem[0];
        float cy     = pitem[1];
        float width  = pitem[2];
        float height = pitem[3];
        float left   = cx - width * 0.5f;
        float top    = cy - height * 0.5f;
        float right  = cx + width * 0.5f;
        float bottom = cy + height * 0.5f;
        // Box::label is an int, so the class index is passed through without a float cast.
        boxes.emplace_back(left, top, right, bottom, confidence, label);

5. nms的准备工作

对boxes按置信度进行排序，lambda表达式的参数使用引用，避免拷贝
// Sort by descending confidence. The comparator takes const references because it
// must not modify its arguments and has to be callable on const objects.
std::sort(boxes.begin(), boxes.end(), [](const Box& a, const Box& b){return a.confidence > b.confidence;});
为避免boxes中框删除过程，vector中框的移动造成时间浪费，采用remove_flags的标记来记录是否删除，true表示已删除
// One flag per box, value-initialized to false; true marks a box as suppressed.
std::vector<bool> remove_flags(boxes.size());
vector<>.reserve提前分配空间，避免多次使用push_back分配内存，性能降低

// Reserve room for the worst case (every box kept) so later insertions do not reallocate.
std::vector<Box> box_result;
box_result.reserve(boxes.size());

5. iou计算表达式

    // Intersection-over-union of two boxes given by their corners.
    // Yields 0 when the boxes do not overlap or when the union area is zero.
    auto iou = [](const Box& a, const Box& b){
        // Corners of the overlap rectangle.
        const float ix0 = std::max(a.left, b.left);
        const float iy0 = std::max(a.top, b.top);
        const float ix1 = std::min(a.right, b.right);
        const float iy1 = std::min(a.bottom, b.bottom);

        // Negative extents mean the boxes are disjoint; clamp them to zero.
        const float inter = std::max(0.0f, ix1 - ix0) * std::max(0.0f, iy1 - iy0);
        const float area_a = std::max(0.0f, a.right - a.left) * std::max(0.0f, a.bottom - a.top);
        const float area_b = std::max(0.0f, b.right - b.left) * std::max(0.0f, b.bottom - b.top);
        const float uni = area_a + area_b - inter;

        if(inter == 0 || uni == 0) return 0.0f;
        return inter / uni;
    };

6. nms实现

// Greedy per-class NMS: every box that is not yet suppressed is kept, and it suppresses
// each later box of the same label whose IoU with it reaches nms_threshold.
// Indices are std::size_t to match the unsigned type returned by boxes.size().
for(std::size_t i = 0; i < boxes.size(); ++i){
    if(remove_flags[i]) continue; // already suppressed by an earlier box

    const auto& ibox = boxes[i];
    box_result.emplace_back(ibox);
    for(std::size_t j = i + 1; j < boxes.size(); ++j){
        if(remove_flags[j]) continue;

        const auto& jbox = boxes[j];
        // Only boxes of the same class compete with each other.
        if(ibox.label == jbox.label && iou(ibox, jbox) >= nms_threshold)
            remove_flags[j] = true;
    }
}

6. 画检测框

// Draw each box as a green rectangle with its confidence printed in red just above
// the top-left corner, then write the annotated image to disk.
for(const auto& det : boxes){
    const cv::Point top_left(det.left, det.top);
    const cv::Point bottom_right(det.right, det.bottom);
    cv::rectangle(image, top_left, bottom_right, cv::Scalar(0, 255, 0), 2);

    const auto caption = cv::format("%.2f", det.confidence);
    const cv::Point caption_origin(det.left, det.top - 7);
    cv::putText(image, caption, caption_origin, 0, 0.8, cv::Scalar(0, 0, 255), 2, 16);
}

cv::imwrite("image-draw.jpg", image);

三、后处理GPU解码

GPU解码的重点思想

cpu中可以使用vector表示数量不确定的数组，gpu解码使用[count,box1,box2,…]的方式，此方法需要有最大数量限制
通过atomicAdd实现数组元素的加入，并返回索引
避免不必要的计算

1. 创建stream

    // Create the stream on which the later async copies and kernel launches are issued.
    cudaStream_t stream = nullptr;
    checkRuntime(cudaStreamCreate(&stream));

2. 申请内存

定义最大目标框数量max_objects = 1000
每个框的元素有7个，NUM_BOX_ELEMENT = 7，分别为框的left, top, right, bottom，confidence置信度，class类别，keepflag（是否保留框的标志位，true保留）
sizeof(float)表示count，max_objects * NUM_BOX_ELEMENT * sizeof(float)表示框

// Device/host buffers for GPU decoding.
float* predict_device = nullptr;
    float* output_device = nullptr;
    float* output_host = nullptr;
    int max_objects = 1000;
    int NUM_BOX_ELEMENT = 7;  // left, top, right, bottom, confidence, class, keepflag
    checkRuntime(cudaMalloc(&predict_device, rows * cols * sizeof(float)));
    // Output layout: one float holding the box count, followed by max_objects boxes
    // of NUM_BOX_ELEMENT floats each.
    checkRuntime(cudaMalloc(&output_device, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float)));
    checkRuntime(cudaMallocHost(&output_host, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float)));

    // cudaMalloc does not clear memory, and the decode kernel only increments the count slot
    // with atomicAdd, so the counter must be zeroed on the same stream before the kernel runs.
    // All-zero bytes are 0.0f.
    checkRuntime(cudaMemsetAsync(output_device, 0, sizeof(float), stream));

    checkRuntime(cudaMemcpyAsync(predict_device, predict, rows * cols * sizeof(float), cudaMemcpyHostToDevice, stream));

3. gpu解码

调用cuda函数

// Run GPU decoding on `stream`: cols - 5 is passed as the class count and output_device
// receives the results.
// NOTE(review): the nullptr argument presumably is the inverse affine matrix — confirm
// against the invoker's signature.
decode_kernel_invoker(
        predict_device, rows, cols - 5, confidence_threshold, 
        nms_threshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, stream
    );

定义block和grid的大小

   // At most 512 threads per block; grid is num_bboxes / block rounded up.
   auto block = num_bboxes > 512 ? 512 : num_bboxes;
    auto grid = (num_bboxes + block - 1) / block; // round up
    // NOTE(review): block is 0 when num_bboxes is 0, which makes the line above divide by
    // zero — confirm callers never pass 0.

4. 解码核函数

调用解码核函数

    // Launch the decode kernel on `stream` with no dynamic shared memory; the grid/block
    // configuration provides at least one thread per candidate box.
    decode_kernel<<<grid, block, 0, stream>>>(
        predict, num_bboxes, num_classes, confidence_threshold, 
        invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
    );

解码核函数计算类似于cpu解码，避免不必要的计算
开启总框数量个线程，25200
int index = atomicAdd(parray, 1);计算count大小：parray = [count , box1, box2, box3 …,] atomicAdd -> count += 1 但返回的是未加前的count，详细见CUDA笔记一（atomicAdd）
超过符合阈值的最大框数量的return掉，实现动态数组 if(index >= max_objects) return;

// Decodes raw predictions into boxes, one thread per prediction row (1-D launch).
//
// predict : num_bboxes rows of (5 + num_classes) floats, each laid out as
//           cx, cy, width, height, objectness, class scores...
// parray  : output laid out as [count, box0, box1, ...]; each box occupies
//           NUM_BOX_ELEMENT floats: left, top, right, bottom, confidence, class, keepflag.
//           The count slot is only incremented here, never reset, and it keeps growing
//           even after max_objects boxes have been stored.
// invert_affine_matrix is currently unused (the affine projection is commented out).
static __global__ void decode_kernel(
    float* predict, int num_bboxes, int num_classes, float confidence_threshold, 
    float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT
){  
    const int row_id = blockDim.x * blockIdx.x + threadIdx.x;
    if (row_id >= num_bboxes) return;

    const float* row = predict + (5 + num_classes) * row_id;

    // First filter: objectness alone.
    const float objness = row[4];
    if (objness < confidence_threshold)
        return;

    // Arg-max over the class scores; the first maximum wins on ties.
    const float* scores = row + 5;
    float best_score = scores[0];
    int best_class = 0;
    for (int c = 1; c < num_classes; ++c) {
        const float s = scores[c];
        if (s > best_score) {
            best_score = s;
            best_class = c;
        }
    }

    // Second filter: class score combined with objectness.
    const float conf = best_score * objness;
    if (conf < confidence_threshold)
        return;

    // Claim an output slot; atomicAdd yields the counter value before the increment.
    const int slot = atomicAdd(parray, 1);
    if (slot >= max_objects)
        return;

    // Center/size -> corner coordinates.
    const float cx = row[0];
    const float cy = row[1];
    const float w  = row[2];
    const float h  = row[3];
    const float left   = cx - w * 0.5f;
    const float top    = cy - h * 0.5f;
    const float right  = cx + w * 0.5f;
    const float bottom = cy + h * 0.5f;
    // affine_project(invert_affine_matrix, left,  top,    &left,  &top);
    // affine_project(invert_affine_matrix, right, bottom, &right, &bottom);

    // left, top, right, bottom, confidence, class, keepflag
    float* out = parray + 1 + slot * NUM_BOX_ELEMENT;
    out[0] = left;
    out[1] = top;
    out[2] = right;
    out[3] = bottom;
    out[4] = conf;
    out[5] = best_class;
    out[6] = 1; // 1 = keep, 0 = ignore
}

5. nms核函数

调用nms核函数

启用最大框数量max_objects个线程

// Reuse block/grid for the NMS launch: one thread per output slot (max_objects in total),
// again capped at 512 threads per block.
block = max_objects > 512 ? 512 : max_objects;
    grid = (max_objects + block - 1) / block;
    fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);

6. 实现nms核函数

每个框都要单独对其余所有框进行nms计算是否是检测框
由于并行计算且if过滤掉很多条件，速度很快

// Parallel NMS over the decoded boxes, one thread per box (1-D launch).
//
// bboxes : [count, box0, box1, ...]; each box is NUM_BOX_ELEMENT floats laid out as
//          left, top, right, bottom, confidence, class, keepflag.
// A thread clears its own keepflag when another box of the same class with at least
// the same confidence overlaps it with IoU above `threshold`. Only the keepflag of the
// thread's own box is written; all other fields are read-only here.
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT){

    const int mine_id = blockDim.x * blockIdx.x + threadIdx.x;
    // The stored count may exceed the buffer capacity, so clamp it.
    const int num_valid = min((int)bboxes[0], max_objects);
    if (mine_id >= num_valid)
        return;

    float* mine = bboxes + 1 + mine_id * NUM_BOX_ELEMENT;
    for (int other_id = 0; other_id < num_valid; ++other_id) {
        if (other_id == mine_id) continue;

        const float* other = bboxes + 1 + other_id * NUM_BOX_ELEMENT;

        // Boxes of different classes never suppress each other.
        if (mine[5] != other[5]) continue;

        // Only a box with confidence >= ours can suppress us.
        if (!(other[4] >= mine[4])) continue;

        // Tie-break on equal confidence: only a box with a larger index may suppress us.
        if (other[4] == mine[4] && other_id < mine_id) continue;

        const float overlap = box_iou(
            mine[0],  mine[1],  mine[2],  mine[3],
            other[0], other[1], other[2], other[3]
        );

        if (overlap > threshold) {
            mine[6] = 0;  // 1=keep, 0=ignore
            return;
        }
    }
}

7. 检测框结果存储

    // Copy the whole output buffer back to the host. The buffer starts with a float counter
    // (the kernels read and update it as a float), so the size uses sizeof(float) to match
    // how the buffer was allocated.
    checkRuntime(cudaMemcpyAsync(output_host, output_device, 
        sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float), 
        cudaMemcpyDeviceToHost, stream
    ));
    // Wait for the work queued on the stream before reading output_host.
    checkRuntime(cudaStreamSynchronize(stream));

    // The device-side counter can exceed max_objects, so clamp it before iterating.
    int num_boxes = min((int)output_host[0], max_objects);
    for(int i = 0; i < num_boxes; ++i){
        // Each box: left, top, right, bottom, confidence, class, keepflag.
        float* ptr = output_host + 1 + NUM_BOX_ELEMENT * i;
        int keep_flag = ptr[6];
        if(keep_flag){
            box_result.emplace_back(
                ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5]
            );
        }
    }
    // Release the stream and every buffer allocated for this call.
    checkRuntime(cudaStreamDestroy(stream));
    checkRuntime(cudaFree(predict_device));
    checkRuntime(cudaFree(output_device));
    checkRuntime(cudaFreeHost(output_host));
    return box_result;