tensorrt 加速yolov5 c++部署
yolov5的onnx一体式导出
原版的 yolov5 在导出 onnx 时,只导出 if not self.training 之前的程序段,不包含后续把检测框解码并映射回原图的代码,即:
if self.inplace:
    y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i]) * self.stride[i]  # xy
    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
这是因为按照官方的写法,这部分 op 在 onnx 转 tensorrt 时不被支持。官方的 forward 如下:
def forward(self, x):
    """Upstream YOLOv5 ``Detect.forward``, shown here as the reference version.

    Args:
        x: list of ``self.nl`` feature maps; per the inline shape comment each
            is expected to be ``(bs, na * no, ny, nx)`` (e.g. ``(bs, 255, 20, 20)``).

    Returns:
        * training mode: ``x``, with every entry reshaped to ``(bs, na, ny, nx, no)``;
        * inference with ``self.export`` set: the 1-tuple ``(torch.cat(z, 1),)``;
        * plain inference: ``(torch.cat(z, 1), x)``.

        ``z`` holds, per detection layer, the decoded predictions flattened to
        ``(bs, -1, no)``.
    """
    z = []  # inference output
    for i in range(self.nl):
        x[i] = self.m[i](x[i])  # conv
        bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
        x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

        if not self.training:  # inference
            # Rebuild the cached grid when the feature-map size changed
            # (or always, when dynamic ONNX shapes are requested).
            if self.onnx_dynamic or self.grid[i].shape[2:4] != x[i].shape[2:4]:
                self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)

            y = x[i].sigmoid()
            if self.inplace:
                # Slice assignment: this is the part the article reports as
                # unsupported when converting the ONNX graph to TensorRT.
                y[..., 0:2] = (y[..., 0:2] * 2 + self.grid[i]) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
            else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                xy, wh, conf = y.split((2, 2, self.nc + 1), 4)  # y.tensor_split((2, 4, 5), 4)  # torch 1.8.0
                xy = (xy * 2 + self.grid[i]) * self.stride[i]  # xy
                wh = (wh * 2) ** 2 * self.anchor_grid[i]  # wh
                y = torch.cat((xy, wh, conf), 4)
            z.append(y.view(bs, -1, self.no))

    return x if self.training else (torch.cat(z, 1),) if self.export else (torch.cat(z, 1), x)
因此需要将这部分代码修改如下:
def forward(self, x):
    """Export-friendly ``Detect.forward`` that decodes boxes without slice assignment.

    In export mode the decoded output is assembled with ``torch.split`` /
    ``torch.cat`` only, and an extra class-index column is inserted before the
    per-class scores, so every row has ``self.no + 1`` values:
    ``[x, y, w, h, conf, class_idx, prob_0 .. prob_{nc-1}]``.

    Args:
        x: list of ``self.nl`` feature maps; per the inline shape comment each
            is expected to be ``(bs, na * no, ny, nx)``.

    Returns:
        * ``self.training``: ``x`` (entries are reshaped to ``(bs, na, ny, nx, no)``
          only on the non-export path);
        * ``self.export`` (and not training): a single tensor ``(bs, N, no + 1)``;
        * otherwise: ``(torch.cat(z, 1), x)`` with rows of ``no`` values.

    Notes:
        ``self.anchor_grid_awesome`` must be defined on the module beforehand.
        The original snippet hints at a constant of the form
        ``torch.tensor([[...], [...], [...]]).view(self.nl, 1, -1, 1, 1, 2)``.
    """
    # x = x.copy()  # for profiling
    z = []  # inference output
    # self.training |= self.export
    for i in range(self.nl):
        x[i] = self.m[i](x[i])  # conv
        if self.export:
            print('exporting...')
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            # Work on a separate tensor so x[i] keeps the raw conv output on this path.
            x_i = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
        else:
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()

        if not self.training or self.export:  # inference
            if self.export:
                # Keep every cached grid on the same device as the activations.
                self.grid = [g.to(x_i.device) for g in self.grid]
                anchors = self.anchor_grid_awesome[i]
                if self.grid[i].shape[2:4] != x_i.shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x_i.device)

                y = x_i.sigmoid()
                print('[WARN] you are calling export...')
                # Despite the names, these are the box centre (xy) and size (wh).
                x1y1, x2y2, conf, prob = torch.split(y, [2, 2, 1, self.nc], dim=4)
                x1y1 = ((x1y1 * 2. - 0.5 + self.grid[i].to(x_i.device)) * self.stride[i]).type(x_i.dtype)
                x2y2 = (x2y2 * 2) ** 2 * anchors
                xyxy = torch.cat((x1y1, x2y2), dim=4)
                # Add the arg-max class index as an extra column before the class scores.
                idxs = torch.argmax(prob, dim=-1).unsqueeze(dim=-1).type(x_i.dtype).to(x_i.device)
                y = torch.cat((xyxy, conf, idxs, prob), dim=4).to(x_i.device)
                # We added idxs, hence no + 1 values per row.
                z.append(y.view(bs, -1, self.no + 1))
            else:
                if self.grid[i].shape[2:4] != x[i].shape[2:4]:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
                y = x[i].sigmoid()
                y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
                y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
                z.append(y.view(bs, -1, self.no))

    if self.training:
        return x
    if self.export:
        return torch.cat(z, 1)
    return (torch.cat(z, 1), x)
这样导出的onnx如下图所示,可以看到最后的输出是 1×22743×10 的float数组。其中 22743 = 3×76×76 + 3×38×38 + 3×19×19,对应三个特征图上的全部预测框;每个预测框有10个数:前4位代表坐标(中心点x、y和宽、高),第5位代表conf置信度,第6位代表类别索引,第7-10位代表每个类别的得分。
经过这样的onnx导出,就不需要再编写plugin把三个不同特征图的box解码到原图,极大地减少了工作量。
tensorrt c++部署
tensorrt引擎
前处理
padding resize
/// Builds the network input for a batch of images.
///
/// Each image is resized with its aspect ratio preserved, pasted into the
/// top-left corner of a zero-filled w x h canvas, scaled by 1/255 and written
/// channel-by-channel (CHW) into the returned buffer.
///
/// @param vec_img  Input images; the 3-channel canvas and 8-bit scaling below
///                 suggest 8-bit 3-channel images - TODO confirm with callers.
/// @return Buffer of mBatchSize * c * h * w floats. Slot k holds image k;
///         slots for empty or missing images stay zero.
///
/// Fixes over the previous version:
///  * images beyond mBatchSize are ignored instead of overrunning the buffer;
///  * an empty image still consumes its batch slot, so later images stay at
///    the position postProcess() expects (it indexes the output by image).
///
/// NOTE(review): `ratio` is a single member overwritten per image, while
/// postProcess() applies it to every image of the batch; results are only
/// right when all images in a batch share the same scale factor.
std::vector<float> yolo::prepareImage(std::vector<cv::Mat> &vec_img) {
    const size_t channelLength = static_cast<size_t>(w) * static_cast<size_t>(h);
    const size_t imageLength = channelLength * static_cast<size_t>(c);
    const size_t batchCapacity = static_cast<size_t>(mBatchSize);
    std::vector<float> result(batchCapacity * imageLength);  // value-initialised to 0

    size_t slot = 0;
    for (const cv::Mat &src_img : vec_img)
    {
        if (slot >= batchCapacity)
            break;  // the buffer only has room for mBatchSize images
        float *data = result.data() + slot * imageLength;
        ++slot;
        if (!src_img.data)
            continue;  // leave this slot zero-filled

        // Largest scale at which the whole image still fits into w x h.
        ratio = std::min(float(w) / float(src_img.cols), float(h) / float(src_img.rows));
        cv::Mat flt_img = cv::Mat::zeros(cv::Size(w, h), CV_8UC3);
        cv::Mat rsz_img;
        cv::resize(src_img, rsz_img, cv::Size(), ratio, ratio);
        // Padding goes to the right/bottom only, so box coordinates map back
        // to the source image with a plain division by `ratio`.
        rsz_img.copyTo(flt_img(cv::Rect(0, 0, rsz_img.cols, rsz_img.rows)));
        flt_img.convertTo(flt_img, CV_32FC3, 1.0 / 255);

        // HWC to CHW
        std::vector<cv::Mat> split_img(c);
        cv::split(flt_img, split_img);
        for (int i = 0; i < c; ++i)
        {
            // split_img[i] = (split_img[i] - img_mean[i]) / img_std[i];
            memcpy(data, split_img[i].data, channelLength * sizeof(float));
            data += channelLength;
        }
    }
    return result;
}
后处理
// Turns the raw network output into per-image lists of candidate boxes
// (no NMS is applied here).
//
// vec_Mat : the images of the batch; only their count/order is used below.
// output  : network output, read as 227430 floats per image, laid out as
//           consecutive records of 10 floats each.
// result  : filled with one entry per image, keyed by the image's index in
//           vec_Mat.
//
// NOTE(review): the snippet ends after the per-image loop - neither a
// `return` statement nor the function's closing brace is shown, so the
// meaning of the int return value cannot be determined from here.
int yolo::postProcess( std::vector<cv::Mat> &vec_Mat, float *output,map<int,vector<mybox>>&result) {
    int p_size=vec_Mat.size();  // number of images; not used in the visible code
    int a=0;  // index of the current image within the batch
    for(auto & vec:vec_Mat)
    {
        // Start of this image's slice of the output buffer.
        float* temp_out=output+a*227430;
        vector<mybox> temp_boxvc;
        // Walk the slice one 10-float record at a time.
        for (int i=0;i<227430;)
        {
            // Field 4 is the confidence; keep only records above 0.6.
            if (temp_out[4]>0.6)
            {
                mybox temp_box;
                // Fields 0..3 are read as centre x/y and width/height; dividing by
                // `ratio` (the resize factor stored by prepareImage) maps them back
                // to source-image pixels.
                float xc=temp_out[0]/ratio;
                float yc=temp_out[1]/ratio;
                float w=temp_out[2]/ratio;
                float h=temp_out[3]/ratio;
                // Centre/size -> corner coordinates, truncated to int.
                int xmin=xc-w/2.0;
                int ymin=yc-h/2.0;
                int xmax=xc+w/2.0;
                int ymax=yc+h/2.0;
                // Only the top-left corner is clamped (to 0); xmax/ymax are left as-is.
                xmin=xmin>0 ?xmin:0;
                ymin=ymin>0? ymin:0;
                // Field 5 holds the class index; the score of that class is at 6 + index.
                int cla=int(temp_out[5]);
                float score=temp_out[6+cla];
                temp_box.xmin=xmin;
                temp_box.ymin=ymin;
                temp_box.xmax=xmax;
                temp_box.ymax=ymax;
                temp_box.cla=cla;
                temp_box.prob=score;
                temp_boxvc.push_back(temp_box);
            }
            // Advance to the next record.
            temp_out=temp_out+10;
            i=i+10;
        }
        result[a]=temp_boxvc;
        a=a+1;
    }
nms
void DoNms(std::vector<mybox>& detections,int classes ,float nmsThresh)
{
using namespace std;
// auto t_start = chrono::high_resolution_clock::now();
std::vector<std::vector<mybox>> resClass;
resClass.resize(classes);
for (const auto& item : detections)
resClass[item.cla].push_back(item);
auto iouCompute = [](float * lbox, float* rbox)
{
float interBox[] = {
max(lbox[0] - lbox[2]/2.f , rbox[0] - rbox[2]/2.f), //left
min(lbox[0] + lbox[2]/2.f , rbox[0] + rbox[2]/2.f), //right
max(lbox[1] - lbox[3]/2.f , rbox[1] - rbox[3]/2.f), //top
min(lbox[1] + lbox[3]/2.f , rbox[1] + rbox[3]/2.f), //bottom
};
if(interBox[2] > interBox[3] || interBox[0] > interBox[1])
return 0.0f;
float interBoxS =(interBox[1]-interBox[0])*(interBox[3]-interBox[2]);
return interBoxS/(lbox[2]*lbox[3] + rbox[2]*rbox[3] -interBoxS);
};
std::vector<mybox> result;
for (int i = 0;i<classes;++i)
{
auto& dets =resClass[i];
if(dets.size() == 0)
continue;
sort(dets.begin(),dets.end(),[=](const mybox& left,const mybox& right){
return left.prob > right.prob;
});
for (unsigned int m = 0;m < dets.size() ; ++m)
{
auto& item = dets[m];
result.push_back(item);
float t1[4];
t1[0]=item.xmin+(item.xmax-item.xmin)/2.0;
t1[1]=item.ymin+(item.ymax-item.ymin)/2.0;
t1[2]=item.xmax-item.xmin;
t1[3]=item.ymax-item.ymin;
for(unsigned int n = m + 1;n < dets.size() ; ++n)
{
float t2[4];
t2[0]=dets[n].xmin+(dets[n].xmax-dets[n].xmin)/2.0;
t2[1]=dets[n].ymin+(dets[n].ymax-dets[n].ymin)/2.0;
t2[2]=dets[n].xmax-dets[n].xmin;
t2[3]=dets[n].ymax-dets[n].ymin;
float t=iouCompute(t1,t2);
if (iouCompute(t1,t2) > nmsThresh)
{
dets.erase(dets.begin()+n);
--n;
}
}
}
}
//swap(detections,result);
detections = move(result);
// auto t_end = chrono::high_resolution_clock::now();
// float total = chrono::duration<float, milli>(t_end - t_start).count();
// cout << "Time taken for nms is " << total << " ms." << endl;
}