YOLO v5 TensorRT推理
#include "iostream"
#include "NvInfer.h"
#include <fstream>
#include <sstream>
#include <assert.h>
#include <vector>
#include <numeric>
#include "opencv2/opencv.hpp"
#include <algorithm>
#include <math.h>
#include <ctime>
#include <math.h>
#include <fstream>
// Number of floats that describe one bounding box (array length of the bbox fields below).
#define LOCATIONS 4
// Upper bound on how many candidate boxes nms() forwards to the suppression stage.
#define MAX_OUTPUT_BBOX_COUNT 30000
// Upper bound on how many surviving detections nms() appends to its result vector.
#define MAX_DET 300
// Floats per candidate in the flattened list built by nms(): 4 box values, score, class id.
#define PERSIZE 6
// Network input width / height in pixels.
// NOTE(review): main() passes a literal 416 rather than these macros — confirm they stay in sync.
#define INPUT_W 416
#define INPUT_H 416
// Layout of one raw prediction row in the network output buffer.
// nms() uses this struct only through sizeof(Detection) / sizeof(float) to get the
// number of floats per row, and then indexes the raw float array directly, so the
// number and order of the float members must match the model output.
struct Detection {
// Box as center_x, center_y, width, height (nms() converts it with xywh2xyxy).
float bbox[LOCATIONS];
// Box confidence (row index 4): nms() thresholds it and multiplies it by each class score.
float conf;
// Score of class 0 (row index 5 in nms()).
float class_air;
// Score of class 1 (row index 6 in nms()).
float class_oxy;
};
// One candidate box after class selection, as ranked and filtered by nms().
// compareAndfilter() fills it with positional aggregate initialization, so the
// member order below must not change.
struct ResultDetection {
// Box corners x1, y1, x2, y2: nms() converts the raw center/size box with xywh2xyxy
// before compareAndfilter() copies it here.
float bbox[LOCATIONS];
// Combined score: box confidence multiplied by the winning class score.
float conf;
// Index of the winning class (0 or 1 as assigned in nms()).
int class_id;
// Set to 1 by nms() when a higher-ranked overlapping box suppresses this one, else 0.
int suppressed;
};
// Logger handed to the TensorRT runtime; `glogger` is the single shared instance.
class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // Drop everything that ranks after kWARNING in the Severity enum
        // (i.e. info-level messages); print the rest to stdout.
        if (severity > Severity::kWARNING)
            return;
        std::cout << msg << std::endl;
    }
} glogger;
// Returns the size in bytes of one element of TensorRT data type `t`.
// Throws std::runtime_error for any data type not listed here.
unsigned int getElementSize(nvinfer1::DataType t)
{
    using nvinfer1::DataType;

    if (t == DataType::kINT32 || t == DataType::kFLOAT)
        return 4;
    if (t == DataType::kHALF)
        return 2;
    if (t == DataType::kBOOL || t == DataType::kINT8)
        return 1;

    throw std::runtime_error("Invalid DataType.");
}
// Returns the number of elements described by `d`, i.e. the product of its first
// d.nbDims extents (1 when nbDims is 0).
int64_t volume(const nvinfer1::Dims& d)
{
    // The accumulator type of std::accumulate is the type of the initial value.
    // Seeding with a plain `1` made it `int`, so the running product was truncated
    // to int on every step; seed with int64_t so the product is kept in 64 bits.
    return std::accumulate(d.d, d.d + d.nbDims, int64_t{1}, std::multiplies<int64_t>());
}
// Reads the whole serialized engine from `file`, closes the stream and deserializes
// the bytes into a TensorRT engine. The result is checked with assert only.
// NOTE(review): the IRuntime created here is never released — confirm this is intended.
nvinfer1::ICudaEngine* loadEngine(std::fstream& file){
    std::string serialized;
    // Drain the stream; the loop ends once peek() reports end of file.
    while (file.peek() != EOF) {
        std::stringstream chunk;
        chunk << file.rdbuf();
        serialized.append(chunk.str());
    }
    file.close();

    nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(glogger);
    nvinfer1::ICudaEngine* loaded =
        runtime->deserializeCudaEngine(serialized.data(), serialized.size(), nullptr);
    assert(loaded != nullptr);
    return loaded;
}
/*
Resizes `image` in place so that it fits inside newShape while keeping its aspect
ratio, then pads the borders with a constant color.

newShape = [width, height]
color    = three channel values used for the padding
aut      = when non-zero, only the remainder of the padding modulo 32 is applied
returns the resize ratio that was applied to the image
*/
float letterbox(cv::Mat& image, int* newShape, int* color, int aut=0){
    const int srcW = image.cols;
    const int srcH = image.rows;

    // Use the smaller of the two scale factors so the whole image fits.
    double ratio = std::min((float)newShape[0]/srcW, (float)newShape[1]/srcH);

    // Size of the image after scaling, before any padding.
    const int scaledW = (int)round(srcW * ratio);
    const int scaledH = (int)round(srcH * ratio);

    // Total padding needed along each axis.
    float padW = newShape[0] - scaledW;
    float padH = newShape[1] - scaledH;
    if(aut){
        padW = (int)padW % 32;
        padH = (int)padH % 32;
    }

    // Split the padding between the two sides.
    padW = padW / 2;
    padH = padH / 2;

    cv::resize(image, image, cv::Size(scaledW, scaledH));

    // The -0.1 / +0.1 offsets send an odd leftover pixel to the bottom/right side.
    const int padTop = (int)round(padH - 0.1);
    const int padBottom = (int)round(padH + 0.1);
    const int padLeft = (int)round(padW - 0.1);
    const int padRight = (int)round(padW + 0.1);
    cv::copyMakeBorder(image,image,padTop,padBottom,padLeft,padRight,cv::BORDER_CONSTANT,cv::Scalar(color[0],color[1],color[2]));
    return ratio;
}
// Converts a box given as (center_x, center_y, width, height) in `xywh` into its
// corner form (x1, y1, x2, y2), written to `xyxy`.
void xywh2xyxy(float *xywh, float * xyxy){
    // Minimum corner: center minus half the extent along each axis.
    for (int axis = 0; axis < 2; ++axis) {
        xyxy[axis] = (float)(xywh[axis] - xywh[axis + 2] / 2);
    }
    // Maximum corner: center plus half the extent along each axis.
    for (int axis = 0; axis < 2; ++axis) {
        xyxy[axis + 2] = (float)(xywh[axis] + xywh[axis + 2] / 2);
    }
}
// Intersection-over-union of two axis-aligned boxes.
//
// Both boxes use the layout the area terms of this function have always used:
// [0] = min and [1] = max along one axis, [2] = min and [3] = max along the other.
// Returns 0 when the boxes do not overlap, when either range is inverted so that the
// intersection is empty, or when the union area is not positive.
//
// The previous version never computed the intersection: it used lbox itself as the
// intersection box, so the result was area(lbox) / area(rbox) regardless of overlap
// (only correct when lbox lies entirely inside rbox).
float iou(float lbox[4], float rbox[4]) {
    const float interBox[4] = {
        std::max(lbox[0], rbox[0]), // lower bound, first axis
        std::min(lbox[1], rbox[1]), // upper bound, first axis
        std::max(lbox[2], rbox[2]), // lower bound, second axis
        std::min(lbox[3], rbox[3]), // upper bound, second axis
    };
    if (interBox[2] > interBox[3] || interBox[0] > interBox[1])
        return 0.0f;

    const float interBoxS = (interBox[1] - interBox[0]) * (interBox[3] - interBox[2]);
    const float lArea = (lbox[1] - lbox[0]) * (lbox[3] - lbox[2]);
    const float rArea = (rbox[1] - rbox[0]) * (rbox[3] - rbox[2]);
    const float unionS = lArea + rArea - interBoxS;
    // Degenerate (zero-area) boxes would otherwise divide by zero.
    if (unionS <= 0.0f)
        return 0.0f;
    return interBoxS / unionS;
}
// Ordering predicate for sorting detections by descending score:
// true when `a` has a strictly higher conf than `b`.
bool cmp(const ResultDetection& a, const ResultDetection& b) {
    return b.conf < a.conf;
}
void compareAndfilter(std::vector<float>& oriList, std::vector<ResultDetection>& resultList, int max_size){
// 获取原始长度
int len = oriList.size() / PERSIZE;
// 遍历 ===================================================================当前状态下很难会产生超过max_size的候选框,故此不做对置信度的排序 python代码-》 x = x[x[:, 4].argsort(descending=True)[:max_nms]]
for(int j = 0; j < max_size; ++j){
int max = 0;
// for(int i = 0;i < len; ++i){
for(int k = 1; k < len; ++k){
if(oriList[k * PERSIZE + 4] > oriList[max * PERSIZE + 4]){
max = k;
}
}
std::cout << max << std::endl;
ResultDetection temp = {oriList[max * PERSIZE + 0], oriList[max * PERSIZE + 1], oriList[max * PERSIZE + 2], oriList[max * PERSIZE + 3], oriList[max * PERSIZE + 4], (int)oriList[max * PERSIZE + 5], 0};
resultList.emplace_back(temp);
oriList[max * PERSIZE + 4] = -1;
}
}
void nms(std::vector<ResultDetection>& res, float *output, int outSize, float conf_thresh, float nms_thresh = 0.5){
int det_size = sizeof(Detection) / sizeof(float);
// 筛选出第一轮结果,根据conf > conf_thresh
std::vector<int> oneList;
for(int i = 0; i < outSize / det_size; ++i){
if(output[det_size * i + 4] > conf_thresh){
// 记录下所有置信度大于conf_thresh的标签
oneList.emplace_back(i);
}
}
if(oneList.size() == 0) return;
// nms数据
std::vector<float> nmsList;
for(int i : oneList){
int class_index;
float zero_class_conf = output[i*det_size + 5] * output[i*det_size + 4];
float first_class_conf = output[i*det_size + 6] * output[i*det_size + 4];
float xywh[4] = {output[i*det_size + 0], output[i*det_size + 1], output[i*det_size + 2], output[i*det_size + 3]};
float xyxy[4];
xywh2xyxy(xywh, xyxy);
if(zero_class_conf > first_class_conf){
nmsList.emplace_back(xyxy[0]);
nmsList.emplace_back(xyxy[1]);
nmsList.emplace_back(xyxy[2]);
nmsList.emplace_back(xyxy[3]);
nmsList.emplace_back(zero_class_conf);
nmsList.emplace_back(0);
}else{
nmsList.emplace_back(xyxy[0]);
nmsList.emplace_back(xyxy[1]);
nmsList.emplace_back(xyxy[2]);
nmsList.emplace_back(xyxy[3]);
nmsList.emplace_back(first_class_conf);
nmsList.emplace_back(1);
}
}
std::vector<ResultDetection> resultList;
if(nmsList.size()==0) return;
else if ((nmsList.size()/PERSIZE) > MAX_OUTPUT_BBOX_COUNT)
{
// 提取前MAX_OUTPUT_BBOX_COUNT个候选框
compareAndfilter(nmsList, resultList, MAX_OUTPUT_BBOX_COUNT);
}else{
// resultList.assign(nmsList.begin(), nmsList.end());
compareAndfilter(nmsList, resultList, nmsList.size() / PERSIZE);
}
// 简单的排序
std::cout << resultList.size() << std::endl;
for(int i = 0; i < resultList.size(); i++){
auto tempDet = resultList[i];
if(tempDet.suppressed == 1) continue;
auto ix1 = tempDet.bbox[0];
auto iy1 = tempDet.bbox[1];
auto ix2 = tempDet.bbox[2];
auto iy2 = tempDet.bbox[3];
auto iarea = (ix2 - ix1) * (iy2 - iy1);
for(int j = i+1; j < resultList.size(); ++j){
auto yTempDet = resultList[j];
if(yTempDet.suppressed == 1) continue;
auto xx1 = std::max(ix1, yTempDet.bbox[0]);
auto yy1 = std::max(iy1, yTempDet.bbox[1]);
auto xx2 = std::min(ix2, yTempDet.bbox[2]);
auto yy2 = std::min(iy2, yTempDet.bbox[3]);
auto w = std::max((float)0, xx2 - xx1);
auto h = std::max((float)0, yy2 - yy1);
auto inter = w * h;
auto ovr = inter / (iarea + (yTempDet.bbox[2] - yTempDet.bbox[0])*(yTempDet.bbox[3]-yTempDet.bbox[1]) - inter);
if(ovr > nms_thresh){
resultList[j].suppressed = 1;
}
}
}
for(int i = 0; i < resultList.size() && res.size() < MAX_DET; ++i){
ResultDetection a = resultList[i];
if(!a.suppressed) res.emplace_back(a);
}
}
// Limits a coordinate to the range [0, size]: negative values become 0, values
// that are not below `size` become `size`, everything else is returned unchanged.
float getTrueCor(float cor, float size){
    if (cor < 0) {
        return 0;
    }
    return (cor < size) ? cor : size;
}
// Maps a box predicted on the letterboxed image back onto the original image.
//
// img             original image
// bbox            xmin, ymin, xmax, ymax in letterboxed-image pixels
// img_letter_box  the resized and padded image the network saw
// scale           resize ratio that was applied to the original image
// Returns the box as a cv::Rect in original-image pixels, limited to the image bounds.
cv::Rect get_rect(cv::Mat& img, float bbox[4], cv::Mat& img_letter_box, float scale) {
    // Ratio between the letterboxed image and the original one (the smaller axis ratio).
    float gain = std::min((float)img_letter_box.cols / img.cols, (float)img_letter_box.rows / img.rows);

    // Padding on one side of each axis; the axis that fills the letterbox gets 0.
    int pad_cols = int(img_letter_box.cols - img.cols*gain) / 2; // w
    int pad_rows = int(img_letter_box.rows - img.rows*gain) / 2; // h

    // Remove the padding, undo the scaling, limit to [0, limit] and truncate to a whole pixel.
    auto toOriginal = [scale](float value, int pad, int limit) -> float {
        return (int)(getTrueCor((value - pad) / scale, limit));
    };
    const float left = toOriginal(bbox[0], pad_cols, img.cols);
    const float right = toOriginal(bbox[2], pad_cols, img.cols);
    const float top = toOriginal(bbox[1], pad_rows, img.rows);
    const float bottom = toOriginal(bbox[3], pad_rows, img.rows);

    return cv::Rect(round(left), round(top), round(right - left), round(bottom - top));
}
int main(){
std::string enginePath="/data/kile/202204/yolov5/log/2.engine";
// 创建文件流
std::fstream file(enginePath, std::ios_base::binary | std::ios_base::in);
if(!file.is_open()){
return 10001; // engine文件打开失败
}
nvinfer1::ICudaEngine* engine = loadEngine(file);
nvinfer1::IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
clock_t start, end;
start = clock();
void *buffers[2];
std::vector<int64_t> bufferSize;
int nBindings = engine->getNbBindings();
bufferSize.resize(nBindings);
for(int i = 0; i < nBindings; ++i){
nvinfer1::Dims dims = engine->getBindingDimensions(i);
nvinfer1::DataType dtype = engine->getBindingDataType(i);
int64_t totalSize = volume(dims) * 1 * getElementSize(dtype);
bufferSize[i] = totalSize;
cudaMalloc(&buffers[i], totalSize);
}
int outSize = bufferSize[1] / sizeof(float);
std::string imagePath = "/data/kile/202204/yolov5/result/aircraft_4.jpg";
cv::Mat image = cv::imread(imagePath);
cv::Mat img1 = image.clone();
int size[2] = {416, 416};
int color[3] = {114, 114, 114};
float scale = letterbox(image, size, color);
cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
// image.convertTo(image, CV_32F, 255.0, 0);
// time_t start,end;
// start = clock();
std::vector<float> a;
if(image.isContinuous()){
// a->assign(image.datastart, image.dataend);
a.assign(image.datastart, image.dataend);
}
float image_content[image.channels()*image.rows*image.cols];
for (int i = 0; i < image.rows*image.cols; i++)
{ // (float)(a[i*3] / 255)
image_content[i] = (float)(a[i*3] / 255);
image_content[i+image.cols*image.rows] = (float)(a[i*3+1] / 255);
image_content[i+2*image.cols*image.rows] = (float)(a[i*3+2] / 255); // 0.003
// std::cout << image_content[i] << std::endl;
}
// end = clock();
// std::cout << (float) (end - start) / CLOCKS_PER_SEC << std::endl;
cudaError_t flag;
cudaStream_t stream;
flag = cudaStreamCreate(&stream);
if (flag != cudaSuccess)
{
std::cout << "1 cudaStreamCreate error : " << flag << std::endl;
return 40001;
}
flag = cudaMemcpyAsync(buffers[0],&image_content, bufferSize[0],cudaMemcpyHostToDevice,stream);
if (flag != cudaSuccess)
{
std::cout << "2 cudaMemcpyAsync input error : " << cudaGetErrorString(flag) << std::endl;
return 40002;
}
bool status = context->enqueueV2(buffers, stream, nullptr);
if (!status){
std::cout << "4 inference error : " << status << std::endl;
return 40004;
}
float result[outSize];
flag = cudaMemcpyAsync(result, buffers[1], bufferSize[1], cudaMemcpyDeviceToHost,stream);
if (flag != cudaSuccess)
{
std::cout << "3 cudaMemcpyAsync output error : " << cudaGetErrorString(flag) << std::endl;
return 40003;
}
cudaStreamSynchronize(stream);
cudaStreamDestroy(stream);
// std::fstream f;
// f.open("/data/kile/202204/yolov5/log/imagecontent.txt", std::ios_base::out);
// for (int i = 0; i < 74529; i++)
// { // (float)(a[i*3] / 255)
// f << std::to_string(result[i]) << std::endl;
// }
// f.close();
std::vector<ResultDetection> res;
nms(res, result, outSize, 0.25, 0.45);
for (size_t j = 0; j < res.size(); j++) {
cv::Rect r = get_rect(img1, res[j].bbox, image, scale);
cv::rectangle(img1, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
cv::putText(img1, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
}
cv::imwrite("/data/kile/202204/yolov5/log/test_c.jpg", img1);
end = clock();
std::cout << "hello world!! " << (float) (end - start) / CLOCKS_PER_SEC << std::endl;
getchar();
return 1;
}
import time
from typing import List
import numpy as np
import tensorrt
import torch
from pycuda import driver
import pycuda.autoinit
from PIL import Image, ImageDraw, ImageFont
from utils.general import non_max_suppression, scale_coords
def time_sync():
    """Return the current wall-clock time in seconds.

    When CUDA is available, pending GPU work is synchronized first so the
    timestamp is taken after that work has finished.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()
def trt_pre(batch, context, d_size,
            d_type):  # Need to set both input and output precisions to FP16 to fully enable FP16
    """Run one inference through a TensorRT execution context and return the output.

    Args:
        batch: numpy array holding the network input; it is flattened before upload.
        context: TensorRT execution context used to run the model.
        d_size: number of elements in the output buffer.
        d_type: numpy dtype of the output buffer.

    Returns:
        A flat numpy array of ``d_size`` elements of type ``d_type``.
    """
    output = np.empty(d_size, dtype=d_type)
    batch = batch.reshape(-1)

    # Device buffers for the input and the output; the bindings list is ordered
    # input first, output second.
    d_input = driver.mem_alloc(1 * batch.nbytes)
    d_output = driver.mem_alloc(output.nbytes)
    bindings = [int(d_input), int(d_output)]
    stream = driver.Stream()

    # Transfer input data to device
    driver.memcpy_htod_async(d_input, batch, stream)
    # Execute model
    context.execute_async_v2(bindings, stream.handle, None)
    # Transfer predictions back
    driver.memcpy_dtoh_async(output, d_output, stream)
    # Synchronize so `output` is complete before it is returned
    stream.synchronize()
    return output
def python_tensorrt_predict(model_path):
    """Run a serialized TensorRT engine on a sample image and save the annotated result.

    Args:
        model_path: path to the serialized TensorRT engine file.

    Side effects:
        Prints the inference time and the total elapsed time, and writes the
        annotated image to ``log/test.jpg``.
    """
    # Create the TensorRT runtime
    trt_model = tensorrt.Runtime(tensorrt.Logger(tensorrt.Logger.WARNING))
    # Deserialize the engine; the file handle is closed as soon as it has been read
    with open(model_path, "rb") as engine_file:
        engine = trt_model.deserialize_cuda_engine(engine_file.read())
    # Create the inference context
    context = engine.create_execution_context()

    # Output binding -> element count and dtype; input binding -> input width/height
    for binding in engine:
        if not engine.binding_is_input(binding):
            size = tensorrt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = tensorrt.nptype(engine.get_binding_dtype(binding))
        else:
            input_w = engine.get_binding_shape(binding)[-1]
            input_h = engine.get_binding_shape(binding)[-2]

    from utils.datasets import LoadImages
    # source = r"/data/kile/202204/yolov5/result/aircraft_4.jpg"
    source = r"/data/kile/202204/yolov5/video/data/data_data/cd3ed3d2cf7611eca8630050569379a7.jpg"
    start = time.perf_counter()
    dataset = LoadImages(source, img_size=[input_w, input_h], stride=32, auto=False)
    for path, im, im0s, vid_cap, s in dataset:
        image = Image.open(path)
        im = torch.from_numpy(im).to("cuda").float()
        im /= 255  # scale pixel values to [0, 1]

        start1 = time.perf_counter()
        outputs = trt_pre(np.asarray(im.cpu(), dtype=np.float32), context, size, dtype)
        end1 = time.perf_counter()
        print(f"inference {end1 - start1}")
        # with open("log/outputs.txt", "w") as w:
        #     for i in outputs:
        #         w.write(str(i))
        #         w.write("\n")

        # The flat output is viewed as rows of 7 values and given a batch dimension
        outputs = torch.as_tensor(outputs).reshape((-1, 7)).unsqueeze(0)
        pred = non_max_suppression(outputs, 0.25, 0.45)
        # Map the boxes from the network input size back to the original image size
        pred[0][:, :4] = scale_coords(im.shape[1:], pred[0][:, :4], im0s.shape)
        image = drawImage(image, list(pred))
        image.save(r"log/test.jpg")
    end = time.perf_counter()
    print(f"{end -start}")
def drawImage(image, class_list):
    """Draw the detections in ``class_list[0]`` onto ``image`` and return it.

    Args:
        image: PIL image to draw on (modified in place).
        class_list: sequence whose first element iterates over detections; each
            detection holds four box values (left, top, right, bottom) followed
            by two more values that are joined into the label text.

    Returns:
        The same PIL image with boxes and labels drawn.
    """
    font = ImageFont.truetype(font='/data/kile/other/yolov3/font/FiraMono-Medium.otf',
                              size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
    thickness = (image.size[0] + image.size[1]) // 300
    for i in class_list[0]:
        if not isinstance(i, List):
            i = list(i)
        label = str(i[-1]) + "_" + str(i[-2])
        box = i[:-2]
        left, top, right, bottom = box
        top = int(top.numpy())
        left = int(left.numpy())
        bottom = int(bottom.numpy())
        right = int(right.numpy())

        draw = ImageDraw.Draw(image)
        # NOTE(review): ImageDraw.textsize is not available in recent Pillow releases — confirm the pinned version.
        label_size = draw.textsize(label, font)

        # Round to whole pixels and keep the box inside the image
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
        right = min(image.size[0], np.floor(right + 0.5).astype('int32'))

        # Put the label above the box when there is room, otherwise just inside it
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])
        else:
            text_origin = np.array([left, top + 1])

        # Draw nested one-pixel rectangles to get the requested line thickness.
        # A separate loop name avoids rebinding the detection variable `i`.
        for offset in range(thickness):
            draw.rectangle(
                [left + offset, top + offset, right - offset, bottom - offset],
                outline=(0x27, 0xC1, 0x36))
        draw.rectangle(
            [tuple(text_origin), tuple(text_origin + label_size)],
            fill=(128, 0, 128))
        draw.text(text_origin, label, fill=(0, 0, 0), font=font)
        del draw
    return image
if __name__ == '__main__':
    # The TensorRT model is generated with the official export.py script
    # model_path = r"/data/kile/202204/yolov5/log/2.engine"
    model_path = r"/data/kile/202204/yolov5/log/1.trt"
    python_tensorrt_predict(model_path)
from PIL import Image
# path = r"/data/kile/data/oridata_100/n0942195117838/n0942195117838.jpeg"
# image = cv2.imread(path)
# im, _, _ = letterbox(image, (416,416), auto=False)
# cv2.imwrite("/data/1.jpg", im)
使用 TensorRT C++ 推理时,速度大约比 YOLOv5 的 Python 实现快 1 倍,即耗时约为 Python 的 1/2。