tensorrt 加速yolov5 c++部署

yolov5的onnx一体式导出

原版的yolov5在导出onnx时,只导出 `if not self.training` 之前的程序段,不包含后续将检测结果解码并映射回原图的代码,即 `if self.inplace:` 分支中的 `y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i]) * self.stride[i]  # xy` 和 `y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh`。这是因为按官方的写法,onnx转tensorrt时这部分op不被tensorrt支持。

def forward(self, x):
        """Apply the detection head to every feature level.

        Each entry of ``x`` is passed through its output conv and rearranged
        from ``(bs, na * no, ny, nx)`` to ``(bs, na, ny, nx, no)``; ``x`` is
        updated in place with the rearranged maps.

        Returns:
            * training: the list ``x`` of rearranged raw maps;
            * export:   a 1-tuple holding the concatenated decoded predictions;
            * otherwise ``(decoded_predictions, x)``.
        """
        outputs = []  # decoded predictions, one tensor per detection layer
        for layer in range(self.nl):
            x[layer] = self.m[layer](x[layer])  # per-level output conv
            batch, _, rows, cols = x[layer].shape
            # (bs, na*no, ny, nx) -> (bs, na, ny, nx, no)
            reshaped = x[layer].view(batch, self.na, self.no, rows, cols)
            x[layer] = reshaped.permute(0, 1, 3, 4, 2).contiguous()

            if self.training:
                continue  # decoding only happens at inference time

            # Rebuild the cached grids when forced to or when the map size changed.
            grid_stale = self.onnx_dynamic or self.grid[layer].shape[2:4] != x[layer].shape[2:4]
            if grid_stale:
                self.grid[layer], self.anchor_grid[layer] = self._make_grid(cols, rows, layer)

            pred = x[layer].sigmoid()
            if self.inplace:
                # Decode centre (xy) and size (wh) directly inside the tensor.
                pred[..., 0:2] = (pred[..., 0:2] * 2 + self.grid[layer]) * self.stride[layer]
                pred[..., 2:4] = (pred[..., 2:4] * 2) ** 2 * self.anchor_grid[layer]
            else:
                # Slice-assignment-free path, for YOLOv5 on AWS Inferentia
                # https://github.com/ultralytics/yolov5/pull/2953
                centers, sizes, scores = pred.split((2, 2, self.nc + 1), 4)
                centers = (centers * 2 + self.grid[layer]) * self.stride[layer]
                sizes = (sizes * 2) ** 2 * self.anchor_grid[layer]
                pred = torch.cat((centers, sizes, scores), 4)
            outputs.append(pred.view(batch, -1, self.no))

        if self.training:
            return x
        merged = torch.cat(outputs, 1)
        if self.export:
            return (merged,)
        return (merged, x)

因此需要将这部分代码修改如下:

    def forward(self, x):
        """Detection head with box decoding folded into the exported graph.

        Three modes, selected by ``self.training`` and ``self.export``:

        * training (``self.training`` true): returns the list ``x``. Without
          ``self.export`` each entry has been rearranged to
          ``(bs, na, ny, nx, no)``.
        * export (``self.export`` true, not training): ``x[i]`` keeps the raw
          conv output; decoding runs on a rearranged copy and a single tensor
          of shape ``(bs, N, no + 1)`` is returned. Each row is laid out as
          ``[x, y, w, h, conf, class_idx, per-class scores...]``; the extra
          ``class_idx`` column (argmax of the class scores) is why the last
          dimension is ``no + 1``.
        * plain inference: returns ``(decoded, x)`` with ``decoded`` of shape
          ``(bs, N, no)``.

        In export mode the anchors are read from ``self.anchor_grid_awesome``
        instead of ``self.anchor_grid``.
        """
        z = []  # decoded predictions, one tensor per detection layer
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            if self.export:
                print('exporting...')
                bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
                # Leave x[i] untouched and decode from a rearranged copy.
                x_i = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            else:
                bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
                x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

            if not self.training or self.export:
                if self.export:
                    # Keep every cached grid on the same device as the features.
                    self.grid = [g.to(x_i.device) for g in self.grid]
                    self.a = self.anchor_grid_awesome[i]
                    if self.grid[i].shape[2:4] != x_i.shape[2:4]:
                        self.grid[i] = self._make_grid(nx, ny).to(x_i.device)
                    y = x_i.sigmoid()
                    print('[WARN] you are calling export...')
                    # split/cat instead of slice assignment for the decode step
                    xy, wh, conf, prob = torch.split(y, [2, 2, 1, self.nc], dim=4)
                    xy = ((xy * 2. - 0.5 + self.grid[i].to(x_i.device)) * self.stride[i]).type(x_i.dtype)
                    wh = (wh * 2) ** 2 * self.a
                    xywh = torch.cat((xy, wh), dim=4)
                    # Class index column, inserted before the per-class scores.
                    idxs = torch.argmax(prob, dim=-1).unsqueeze(dim=-1).type(x_i.dtype).to(x_i.device)
                    y = torch.cat((xywh, conf, idxs, prob), dim=4).to(x_i.device)

                    # idxs adds one column, hence no + 1
                    z.append(y.view(bs, -1, self.no + 1))
                else:
                    if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                        self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                    y = x[i].sigmoid()
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                    z.append(y.view(bs, -1, self.no))

        if self.training:
            return x
        if self.export:
            return torch.cat(z, 1)
        return (torch.cat(z, 1), x)

这样导出的onnx如下图所示,可以看到最后的输出是22743×10的float数组:22743对应三个特征图上的预测总数(3×76×76 + 3×38×38 + 3×19×19 = 22743);每行10个值中,第1-4位代表坐标(中心点x、y和宽、高),第5位代表conf置信度,第6位代表类别,第7-10位代表每个类别的得分。
经过这样的onnx导出,就不需要再编写plugin把三个不同特征图的box解码到原图,极大地减少了工作量。
在这里插入图片描述

tensorrt c++部署

tensorrt引擎

引擎的构建详情参考本专栏的另一篇文章

前处理

padding resize

// Builds the network input buffer for a batch of images.
//
// Each image is resized with its aspect ratio preserved so that it fits into
// a w x h canvas, pasted into the top-left corner of a zero-filled canvas,
// scaled to [0, 1] floats and written channel by channel (HWC -> CHW) into
// the batch slot that matches its position in vec_img.
//
// Returns a buffer of mBatchSize * w * h * c floats. Slots belonging to empty
// images, or beyond vec_img.size(), stay zero-filled. Images beyond
// mBatchSize are ignored, because the buffer has no room for them.
//
// NOTE(review): `ratio` is a single member overwritten for every image, and
// postProcess divides every box by that one value. A batch of differently
// sized images is therefore rescaled with the ratio of the last image only.
std::vector<float> yolo::prepareImage(std::vector<cv::Mat> &vec_img) {
    std::vector<float> result(mBatchSize * w * h * c);
    float *data = result.data();
    const int channelLength = w * h;
    const int imageLength = channelLength * c;
    int slot = 0;
    for (const cv::Mat &src_img : vec_img)
    {
        // Never write past the buffer allocated for mBatchSize images.
        if (slot >= static_cast<int>(mBatchSize))
            break;
        ++slot;

        if (!src_img.data)
        {
            // Keep the slot (left as zeros) so later images stay aligned with
            // their index in vec_img.
            data += imageLength;
            continue;
        }
        ratio = std::min(float(w) / float(src_img.cols), float(h) / float(src_img.rows));
        cv::Mat flt_img = cv::Mat::zeros(cv::Size(w, h), CV_8UC3);
        cv::Mat rsz_img;
        cv::resize(src_img, rsz_img, cv::Size(), ratio, ratio);
        rsz_img.copyTo(flt_img(cv::Rect(0, 0, rsz_img.cols, rsz_img.rows)));
        flt_img.convertTo(flt_img, CV_32FC3, 1.0 / 255);

        //HWC TO CHW
        std::vector<cv::Mat> split_img(c);
        cv::split(flt_img, split_img);

        for (int i = 0; i < c; ++i)
        {
            memcpy(data, split_img[i].data, channelLength * sizeof(float));
            data += channelLength;
        }
    }
    return result;
}

后处理

// Converts the raw network output into per-image lists of boxes.
//
// `output` is read as consecutive chunks of 227430 floats, one chunk per
// entry of vec_Mat, and each chunk is walked in rows of 10 floats:
//   [0..3] box centre x, centre y, width, height
//   [4]    confidence
//   [5]    class index
//   [6..]  per-class scores
// Rows with confidence above 0.6 are divided by the member `ratio` (set in
// prepareImage), converted to corner form and stored in result[image index].
//
// NOTE(review): the chunk size 227430, the row width 10 and the 0.6 threshold
// are hard-coded, so this only matches a model with exactly that output shape.
// NOTE(review): the closing brace and the return statement of this function
// are not part of the excerpt shown here.
int yolo::postProcess( std::vector<cv::Mat> &vec_Mat, float *output,map<int,vector<mybox>>&result) {
    int p_size=vec_Mat.size();  // not used in the visible part of the function
     int a=0;  // index of the image currently being decoded
    for(auto & vec:vec_Mat)  // `vec` itself is not used; the loop only counts images
    {
        float*  temp_out=output+a*227430;  // start of this image's chunk
        vector<mybox> temp_boxvc;
        for (int i=0;i<227430;)
        {
            if (temp_out[4]>0.6)  // confidence threshold
            {
                mybox temp_box;
                // Undo the resize scale applied in prepareImage.
                float xc=temp_out[0]/ratio;
                float yc=temp_out[1]/ratio;
                float w=temp_out[2]/ratio;  // local w/h, distinct from the members used in prepareImage
                float h=temp_out[3]/ratio;
                // Centre/size -> corners; float results are truncated to int.
                int xmin=xc-w/2.0;
                int ymin=yc-h/2.0;
                int xmax=xc+w/2.0;
                int ymax=yc+h/2.0;
                // Only the top-left corner is clamped to 0; xmax/ymax are not clamped.
                xmin=xmin>0 ?xmin:0;
                ymin=ymin>0? ymin:0;
                int cla=int(temp_out[5]);
                float score=temp_out[6+cla];  // score of the predicted class
                temp_box.xmin=xmin;
                temp_box.ymin=ymin;
                temp_box.xmax=xmax;
                temp_box.ymax=ymax;
                temp_box.cla=cla;
                temp_box.prob=score;
                temp_boxvc.push_back(temp_box);
            }

            // Advance to the next 10-float row.
            temp_out=temp_out+10;
            i=i+10;
        }
        result[a]=temp_boxvc;
        a=a+1;
    }

nms

// Per-class greedy non-maximum suppression.
//
// detections: in/out. On return it holds only the surviving boxes, grouped by
//             class index in ascending order and sorted by descending `prob`
//             within each class.
// classes:    number of classes. Detections whose `cla` lies outside
//             [0, classes) are dropped rather than indexed out of bounds.
// nmsThresh:  a box is suppressed when its IoU with an already kept box of
//             the same class is strictly greater than this value.
void DoNms(std::vector<mybox>& detections,int classes ,float nmsThresh)
{
    // Bucket the detections by class so suppression never crosses classes.
    std::vector<std::vector<mybox>> resClass;
    resClass.resize(classes > 0 ? classes : 0);

    for (const auto& item : detections)
    {
        if (item.cla < 0 || item.cla >= classes)
            continue;
        resClass[item.cla].push_back(item);
    }

    // IoU of two boxes given as {centre x, centre y, width, height}.
    auto iouCompute = [](const float* lbox, const float* rbox)
    {
        const float left   = std::max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f);
        const float right  = std::min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f);
        const float top    = std::max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f);
        const float bottom = std::min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f);

        if(top > bottom || left > right)
            return 0.0f;

        const float interBoxS = (right - left) * (bottom - top);
        const float unionS = lbox[2]*lbox[3] + rbox[2]*rbox[3] - interBoxS;
        // Two degenerate (zero-area) boxes would otherwise give 0/0.
        if(unionS <= 0.0f)
            return 0.0f;
        return interBoxS / unionS;
    };

    // Corner form (xmin, ymin, xmax, ymax) -> {centre x, centre y, width, height}.
    auto toCenterBox = [](const mybox& box, float* out)
    {
        out[0] = box.xmin + (box.xmax - box.xmin) / 2.0;
        out[1] = box.ymin + (box.ymax - box.ymin) / 2.0;
        out[2] = box.xmax - box.xmin;
        out[3] = box.ymax - box.ymin;
    };

    std::vector<mybox> result;
    for (auto& dets : resClass)
    {
        if(dets.empty())
            continue;

        // Highest score first, so each kept box suppresses weaker overlaps.
        std::sort(dets.begin(), dets.end(), [](const mybox& left, const mybox& right){
            return left.prob > right.prob;
        });

        // Flag suppressed boxes instead of erasing from the middle of the vector.
        std::vector<char> suppressed(dets.size(), 0);
        for (std::size_t m = 0; m < dets.size(); ++m)
        {
            if (suppressed[m])
                continue;
            result.push_back(dets[m]);

            float kept[4];
            toCenterBox(dets[m], kept);
            for (std::size_t n = m + 1; n < dets.size(); ++n)
            {
                if (suppressed[n])
                    continue;
                float other[4];
                toCenterBox(dets[n], other);
                if (iouCompute(kept, other) > nmsThresh)
                    suppressed[n] = 1;
            }
        }
    }

    detections = std::move(result);
}

完整的代码链接

帮忙github点个星

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小涵涵

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值