tensorrt 加速yolov5 c++部署

最新推荐文章于 2024-05-28 14:47:52 发布

小涵涵

最新推荐文章于 2024-05-28 14:47:52 发布

阅读量883

点赞数

分类专栏： Tensorrt实现各类深度学习算法文章标签： c++ 深度学习 pytorch

本文链接：https://blog.csdn.net/qq_34929889/article/details/124942842

版权

Tensorrt实现各类深度学习算法专栏收录该内容

2 篇文章 3 订阅

订阅专栏

tensorrt 加速yolov5 c++部署

yolov5的onnx一体式导出
tensorrt c++部署
完整的代码链接

yolov5的onnx一体式导出

原版 yolov5 导出 onnx 时，只导出 `if not self.training` 之前的程序段，不包含后续把检测结果映射到原图上的解码代码，即 `if self.inplace:` 分支中的 `y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i]) * self.stride[i]  # xy` 和 `y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh`。这是因为按官方的写法导出时，这部分算子（op）在 onnx 转 tensorrt 时不被支持。原版的 forward 如下：

def forward(self, x):
        """Stock YOLOv5 detection-head forward pass, quoted here for reference.

        Args:
            x: indexable collection of per-level feature maps (levels
                ``0 .. self.nl - 1``). Each entry is overwritten in place with
                the reshaped head output of that level.

        Returns:
            ``x`` when ``self.training`` is set. Otherwise ``(torch.cat(z, 1),)``
            when ``self.export`` is set, else ``(torch.cat(z, 1), x)``, where
            ``z`` holds the decoded predictions of every level.
        """
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            # Split the channel axis into (anchor, per-box output) and move the
            # per-box outputs to the last axis.
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training:  # inference
                # Rebuild the cached grid when dynamic export is requested or the
                # cached grid no longer matches this feature map's spatial size.
                if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

                y = x[i].sigmoid()
                if self.inplace:
                    # Decode by writing into slices of y; statement order matters
                    # because each assignment mutates y.
                    y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    # Same decode without slice assignment: split, transform, re-concatenate.
                    xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4)  # torch 1.8.0
                    xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                    wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                    y = torch.cat((xy, wh, conf), 4)
                # Flatten anchors and grid cells into a single prediction axis.
                z.append(y.view(bs, -1, self.no))

        # Training returns the raw per-level tensors; inference concatenates all levels.
        return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)

因此需要将这部分代码修改如下：

    def forward(self, x):
        """Detection-head forward pass with box decoding folded into the export graph.

        Args:
            x: indexable collection of per-level feature maps (levels
                ``0 .. self.nl - 1``). ``x[i]`` is always replaced by the conv
                output; outside export mode it is additionally reshaped to
                ``(bs, na, ny, nx, no)``.

        Returns:
            * ``self.training`` set: ``x`` (checked first, even if ``self.export``
              is also set).
            * ``self.export`` set: a single tensor ``torch.cat(z, 1)`` whose last
              axis has ``self.no + 1`` values per prediction, laid out as
              ``[x, y, w, h, conf, class_idx, class_prob_0 .. class_prob_{nc-1}]``.
            * otherwise: ``(torch.cat(z, 1), x)`` with ``self.no`` values per
              prediction.

        Side effects:
            Refreshes ``self.grid`` and, in export mode, stores the anchors of the
            last processed level in ``self.a``. Export mode also prints progress
            messages once per level.
        """
        z = []  # decoded predictions, one tensor per detection level
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            if self.export:
                print('exporting...')
                bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
                # Reshape into a local tensor so x[i] keeps the raw conv output.
                x_i = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            else:
                bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
                x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training or self.export:
                if self.export:
                    # Use `g` rather than `i` so the level index is not shadowed.
                    self.grid = [g.to(x_i.device) for g in self.grid]
                    # Anchors come from the precomputed `anchor_grid_awesome` table
                    # instead of `anchor_grid`; kept on `self.a` as before.
                    self.a = self.anchor_grid_awesome[i]
                    if self.grid[i].shape[2:4] != x_i.shape[2:4]:
                        self.grid[i] = self._make_grid(nx, ny).to(x_i.device)
                    y = x_i.sigmoid()
                    print('[WARN] you are calling export...')
                    # Decode with split/cat instead of slice assignment so the
                    # whole decode ends up in the exported graph.
                    xy, wh, conf, prob = torch.split(y, [2, 2, 1, self.nc], dim=4)
                    xy = ((xy * 2. - 0.5 + self.grid[i]) * self.stride[i]).type(x_i.dtype)  # box centre
                    wh = (wh * 2) ** 2 * self.a  # box width / height
                    boxes = torch.cat((xy, wh), dim=4)
                    # Index of the best-scoring class, inserted right before the class scores.
                    idxs = torch.argmax(prob, dim=-1).unsqueeze(-1).type(x_i.dtype).to(x_i.device)
                    y = torch.cat((boxes, conf, idxs, prob), dim=4).to(x_i.device)

                    # idxs adds one value per prediction, hence no + 1
                    z.append(y.view(bs, -1, self.no + 1))
                else:
                    if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                        self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                    y = x[i].sigmoid()
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                    z.append(y.view(bs, -1, self.no))

        if self.training:
            return x
        if self.export:
            return torch.cat(z, 1)
        return (torch.cat(z, 1), x)

这样导出的onnx如下图所示，可以看到最后的输出是 22743×10 的 float 数组：22743 = 3×76×76 + 3×38×38 + 3×19×19，对应三个特征图上的全部预测框；每个框的 10 个值中，第 1-4 位代表坐标（中心点 x、y 以及宽、高），第 5 位代表 conf 置信度，第 6 位代表类别编号，第 7-10 位代表每个类别的得分（本例共 4 个类别）。
经过这样的onnx导出就不需要再编写plugin去解码三个不同特征图的box到原图，极大地减少了工作量。
在这里插入图片描述

tensorrt c++部署

tensorrt引擎

引擎的构建详情参考本专栏的另一篇文章

前处理

padding resize

/// Builds the network input buffer for one batch.
///
/// Each image is resized by a single aspect-preserving factor so that it fits
/// inside w x h, copied into the top-left corner of a zero-filled w x h canvas,
/// scaled to [0, 1] and written channel-by-channel (CHW) into the result.
///
/// @param vec_img  Input images; image k is written to batch slot k. Images
///                 beyond mBatchSize are ignored so the buffer cannot overflow.
/// @return Buffer of mBatchSize * c * h * w floats. Slots belonging to empty
///         images (or to missing images when fewer than mBatchSize are given)
///         stay zero, which keeps slot k aligned with vec_img[k] for
///         postProcess.
///
/// NOTE(review): `ratio` is overwritten for every image, while postProcess
/// divides by a single `ratio`; with mixed image sizes only the last image is
/// rescaled correctly.
/// NOTE(review): channels are written in the order stored in the input Mat; no
/// colour conversion is done here. Confirm that this matches what the model
/// was trained with.
std::vector<float> yolo::prepareImage(std::vector<cv::Mat> &vec_img) {
    const int channelLength = w * h;
    const int imageLength = channelLength * c;
    std::vector<float> result(mBatchSize * imageLength);
    float *data = result.data();
    int slot = 0;
    for (const cv::Mat &src_img : vec_img)
    {
        // Never write past the batch the buffer was sized for.
        if (slot >= mBatchSize)
            break;
        ++slot;

        if (!src_img.data)
        {
            // Leave this slot zero-filled but still consume it, so later images
            // keep the same index in the batch as in vec_img.
            data += imageLength;
            continue;
        }

        ratio = std::min(float(w) / float(src_img.cols), float(h) / float(src_img.rows));
        cv::Mat flt_img = cv::Mat::zeros(cv::Size(w, h), CV_8UC3);
        cv::Mat rsz_img;
        cv::resize(src_img, rsz_img, cv::Size(), ratio, ratio);
        // Padding goes to the right/bottom only, so box coordinates map back to
        // the source image with a plain division by `ratio`.
        rsz_img.copyTo(flt_img(cv::Rect(0, 0, rsz_img.cols, rsz_img.rows)));
        flt_img.convertTo(flt_img, CV_32FC3, 1.0 / 255);

        // HWC to CHW
        std::vector<cv::Mat> split_img(c);
        cv::split(flt_img, split_img);

        for (int i = 0; i < c; ++i)
        {
            // Mean/std normalisation is intentionally disabled:
            //  split_img[i] = (split_img[i] - img_mean[i]) / img_std[i];
            memcpy(data, split_img[i].data, channelLength * sizeof(float));
            data += channelLength;
        }
    }
    return result;
}

后处理

// Converts the raw network output into per-image box lists.
//
// vec_Mat : the images of the batch; only their count/order is used here.
// output  : network output, read as 227430 floats per image, i.e. rows of
//           10 floats each.
// result  : filled with one entry per image index, holding every row whose
//           value at index 4 exceeds 0.6.
//
// Row layout as read below: [0..3] centre x, centre y, width, height (each
// divided by `ratio`), [4] confidence, [5] class index stored as a float,
// [6 + class] score of that class.
//
// NOTE(review): the listing shown here ends after the per-image loop; neither a
// return statement nor the closing brace of the function is visible.
int yolo::postProcess( std::vector<cv::Mat> &vec_Mat, float *output,map<int,vector<mybox>>&result) {
    // NOTE(review): p_size is never used in the visible code.
    int p_size=vec_Mat.size();
     int a=0;
    // `a` is the index of the current image within the batch.
    for(auto & vec:vec_Mat)
    {
        // Start of this image's block inside the flat output buffer.
        float*  temp_out=output+a*227430;
        vector<mybox> temp_boxvc;
        // Walk the block one 10-float row at a time.
        for (int i=0;i<227430;)
        {
            // Keep only rows whose confidence is above the fixed 0.6 threshold.
            if (temp_out[4]>0.6)
            {
                mybox temp_box;
                // Undo the resize factor applied during preprocessing.
                float xc=temp_out[0]/ratio;
                float yc=temp_out[1]/ratio;
                float w=temp_out[2]/ratio;
                float h=temp_out[3]/ratio;
                // Centre/size to corner coordinates (truncated to int).
                int xmin=xc-w/2.0;
                int ymin=yc-h/2.0;
                int xmax=xc+w/2.0;
                int ymax=yc+h/2.0;
                // Clamp the top-left corner to the image origin.
                // NOTE(review): xmax/ymax are not clamped to the image size.
                xmin=xmin>0 ?xmin:0;
                ymin=ymin>0? ymin:0;
                // NOTE(review): cla is not range-checked before indexing the scores.
                int cla=int(temp_out[5]);
                float score=temp_out[6+cla];
                temp_box.xmin=xmin;
                temp_box.ymin=ymin;
                temp_box.xmax=xmax;
                temp_box.ymax=ymax;
                temp_box.cla=cla;
                temp_box.prob=score;
                temp_boxvc.push_back(temp_box);
            }

            // Advance to the next row.
            temp_out=temp_out+10;
            i=i+10;
        }
        result[a]=temp_boxvc;
        a=a+1;
    }

nms

void DoNms(std::vector<mybox>& detections,int classes ,float nmsThresh)
{
    using namespace std;
    // auto t_start = chrono::high_resolution_clock::now();

    std::vector<std::vector<mybox>> resClass;
    resClass.resize(classes);

    for (const auto& item : detections)
        resClass[item.cla].push_back(item);

    auto iouCompute = [](float * lbox, float* rbox)
    {
        float interBox[] = {
                max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
                min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
                max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
                min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
        };

        if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
            return 0.0f;

        float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
        return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS);
    };

    std::vector<mybox> result;
    for (int i = 0;i<classes;++i)
    {
        auto& dets =resClass[i];
        if(dets.size() == 0)
            continue;

        sort(dets.begin(),dets.end(),[=](const mybox& left,const mybox& right){
            return left.prob > right.prob;
        });

        for (unsigned int m = 0;m < dets.size() ; ++m)
        {
            auto& item = dets[m];
            result.push_back(item);
            float t1[4];

            t1[0]=item.xmin+(item.xmax-item.xmin)/2.0;
            t1[1]=item.ymin+(item.ymax-item.ymin)/2.0;
            t1[2]=item.xmax-item.xmin;
            t1[3]=item.ymax-item.ymin;
            for(unsigned int n = m + 1;n < dets.size() ; ++n)
            {
                float t2[4];

                t2[0]=dets[n].xmin+(dets[n].xmax-dets[n].xmin)/2.0;
                t2[1]=dets[n].ymin+(dets[n].ymax-dets[n].ymin)/2.0;
                t2[2]=dets[n].xmax-dets[n].xmin;
                t2[3]=dets[n].ymax-dets[n].ymin;
                float t=iouCompute(t1,t2);
                if (iouCompute(t1,t2) > nmsThresh)
                {
                    dets.erase(dets.begin()+n);
                    --n;
                }
            }
        }
    }

    //swap(detections,result);
    detections = move(result);

    // auto t_end = chrono::high_resolution_clock::now();
    // float total = chrono::duration<float, milli>(t_end - t_start).count();
    // cout << "Time taken for nms is " << total << " ms." << endl;
}

完整的代码链接

帮忙github点个星

小涵涵

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
打赏
1
评论
tensorrt 加速yolov5 c++部署

tensorrt 加速yolov5 c++部署yolov5的onnx一体式导出tensorrt c++部署tensorrt引擎前处理后处理nms完整的代码链接yolov5的onnx一体式导出原版的yolov5 onnx导出的时候，导出的代码是if not self.training前边的程序段，不包含后续将检测物体映射到原图上，及后续的 if self.inplace: y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i])
复制链接

扫一扫