yolov5量化部署（基于openvino和tensorrt）

给算法爸爸上香

已于 2023-12-28 00:18:06 修改

阅读量3.2k

点赞数 4

分类专栏： # model deployment deep learning # detection 文章标签： yolov5 openvino tensorrt

于 2023-06-09 18:10:53 首次发布

本文链接：https://blog.csdn.net/taifyang/article/details/131039323

版权

deep learning 同时被 3 个专栏收录

111 篇文章 9 订阅

订阅专栏

model deployment

24 篇文章 4 订阅

订阅专栏

detection

23 篇文章 0 订阅

订阅专栏

yolov5 openvino量化部署

首先，下载YOLOv5源码，安装YOLOv5和OpenVINO的python依赖。

git clone https://github.com/ultralytics/yolov5.git 
pip install -r requirements.txt && pip install openvino openvino-dev

然后，通过YOLOv5提供的export.py将预训练的Pytorch模型转换为OpenVINO FP32 IR模型。

python export.py --weights yolov5n.pt --imgsz 640 --batch-size 1 --include openvino

下面的量化代码改编自：https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/111-yolov5-quantization-migration

from pathlib import Path
from utils.dataloaders import create_dataloader
from utils.general import check_dataset
from export import attempt_load, yaml_save
from val import run as validation_fn
from openvino.tools import mo
from openvino.runtime import serialize
from openvino.tools.pot.api import DataLoader
from openvino.tools.pot.engines.ie_engine import IEEngine
from openvino.tools.pot.graph import load_model
from openvino.tools.pot.pipeline.initializer import create_pipeline
from openvino.tools.pot.graph.model_utils import compress_model_weights
from openvino.tools.pot.graph import load_model, save_model


IMAGE_SIZE = 640
MODEL_NAME = "yolov5n"
DATASET_CONFIG = "./data/coco128.yaml"


class YOLOv5POTDataLoader(DataLoader):
    '''Inherit from DataLoader function and implement for YOLOv5.'''
    def __init__(self, data_source):
        super().__init__({})
        self._data_loader = data_source
        self._data_iter = iter(self._data_loader)

    def __len__(self):
        return len(self._data_loader.dataset)

    def __getitem__(self, item):
        try:
            batch_data = next(self._data_iter)
        except StopIteration:
            self._data_iter = iter(self._data_loader)
            batch_data = next(self._data_iter)

        im, target, path, shape = batch_data

        im = im.float()
        im /= 255
        nb, _, height, width = im.shape
        img = im.cpu().detach().numpy()
        target = target.cpu().detach().numpy()

        annotation = dict()
        annotation["image_path"] = path
        annotation["target"] = target
        annotation["batch_size"] = nb
        annotation["shape"] = shape
        annotation["width"] = width
        annotation["height"] = height
        annotation["img"] = img

        return (item, annotation), img


if __name__ == "__main__":
    '''Conversion of the YOLOv5 model to OpenVINO'''
    onnx_path = f"./{MODEL_NAME}.onnx"

    # fp32 IR model
    fp32_path = f"./FP32_openvino_model/{MODEL_NAME}_fp32.xml"

    print(f"Export ONNX to OpenVINO FP32 IR to: {fp32_path}")
    model = mo.convert_model(onnx_path)
    serialize(model, fp32_path)

    # fp16 IR model
    fp16_path = f"./FP16_openvino_model/{MODEL_NAME}_fp16.xml"

    print(f"Export ONNX to OpenVINO FP16 IR to: {fp16_path}")
    model = mo.convert_model(onnx_path, compress_to_fp16=True)
    serialize(model, fp16_path)

    '''Prepare dataset for quantization'''
    data = check_dataset(DATASET_CONFIG)
    data_source = create_dataloader(data["val"], imgsz=640, batch_size=1, stride=32, pad=0.5, workers=0)[0]
    pot_data_loader = YOLOv5POTDataLoader(data_source)

    '''Configure quantization pipeline'''
    algorithms_config = [
        {
            "name": "DefaultQuantization",
            "params": {
                "preset": "mixed",
                "stat_subset_size": 300,
                "target_device": "CPU"
            },
        }
    ]

    engine_config = {"device": "CPU"}

    model_config = {
        "model_name": f"{MODEL_NAME}",
        "model": fp32_path,
        "weights": fp32_path.replace(".xml", ".bin"),
    }

    pot_model = load_model(model_config)

    engine = IEEngine(config=engine_config, data_loader=pot_data_loader)

    pipeline = create_pipeline(algorithms_config, engine)

    '''Perform model optimization'''
    compressed_model = pipeline.run(pot_model)
    compress_model_weights(compressed_model)
    optimized_save_dir = Path(f"./POT_INT8_openvino_model/")
    save_model(compressed_model, optimized_save_dir, model_config["model_name"] + "_int8")
    pot_int8_path = f"{optimized_save_dir}/{MODEL_NAME}_int8.xml"

    '''Compare accuracy FP32 and INT8 models'''
    model = attempt_load(f"./{MODEL_NAME}.pt", device="cpu", inplace=True, fuse=True) 
    metadata = {"stride": int(max(model.stride)), "names": model.names}  # model metadata
    yaml_save(Path(pot_int8_path).with_suffix(".yaml"), metadata)
    yaml_save(Path(fp32_path).with_suffix(".yaml"), metadata)

    print("Checking the accuracy of the original model:")
    fp32_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(fp32_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )

    fp32_ap5 = fp32_metrics[0][2]
    fp32_ap_full = fp32_metrics[0][3]
    print(f"mAP@.5 = {fp32_ap5}")
    print(f"mAP@.5:.95 = {fp32_ap_full}")

    print("Checking the accuracy of the POT int8 model:")
    int8_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(pot_int8_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )

    pot_int8_ap5 = int8_metrics[0][2]
    pot_int8_ap_full = int8_metrics[0][3]
    print(f"mAP@.5 = {pot_int8_ap5}")
    print(f"mAP@.5:.95 = {pot_int8_ap_full}")

python推理：

import cv2
import numpy as np
from openvino.inference_engine import IECore


names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
        'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
        'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
        'cell phone','microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
        'teddy bear', 'hair drier', 'toothbrush']
conf_thres = 0.5
nms_thres = 0.5
model_path = "yolov5n_fp32.onnx" #onnx推理支持fp32和fp16
model_xml = "./POT_INT8_openvino_model/yolov5n_int8.xml" #IR推理支持fp32、fp16和int8
model_bin = "./POT_INT8_openvino_model/yolov5n_int8.bin"
    
    
def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), scaleup=False, stride=32):
    shape = img.shape[:2]  # current shape [height, width]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    if not scaleup:  # only scale down, do not scale up (for better test mAP)
        r = min(r, 1.0)
    ratio = r  # width, height ratios
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]  # wh padding
    dw /= 2
    dh /= 2
    if shape[::-1] != new_unpad:  # resize
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return img,ratio,(dw,dh)


def iou(b1,b2):
    b1_x1, b1_y1, b1_x2, b1_y2 = b1[0], b1[1], b1[2], b1[3]
    b2_x1, b2_y1, b2_x2, b2_y2 = b2[:,0], b2[:,1], b2[:,2], b2[:,3]

    inter_rect_x1 = np.maximum(b1_x1, b2_x1)
    inter_rect_y1 = np.maximum(b1_y1, b2_y1)
    inter_rect_x2 = np.minimum(b1_x2, b2_x2)
    inter_rect_y2 = np.minimum(b1_y2, b2_y2)

    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(inter_rect_y2 - inter_rect_y1, 0)

    area_b1 = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
    area_b2 = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)

    iou = inter_area / np.maximum((area_b1+area_b2-inter_area),1e-6)
    return iou


def non_max_suppression(boxes, conf_thres=0.5, nms_thres=0.4, ratio=1, pad=(20,20)):
    # 取出batch_size
    bs = np.shape(boxes)[0]
    # xywh___ to____ xyxy
    shape_boxes = np.zeros_like(boxes[:,:,:4])
    shape_boxes[:, :, 0] = boxes[:, :, 0] - boxes[:, :, 2] / 2
    shape_boxes[:, :, 1] = boxes[:, :, 1] - boxes[:, :, 3] / 2
    shape_boxes[:, :, 2] = boxes[:, :, 0] + boxes[:, :, 2] / 2
    shape_boxes[:, :, 3] = boxes[:, :, 1] + boxes[:, :, 3] / 2
    boxes[:, :, :4] = shape_boxes
    boxes[:, :, 5:] *= boxes[:, :, 4:5]

    # output存放每一张图片的预测结果，推理阶段一般是一张图片
    output = []
    for i in range(bs):
        predictions = boxes[i]  # 预测位置xyxy  shape==(12700,85)
        score = np.max(predictions[:, 5:], axis=-1)
        # score = predictions[:,4]  # 存在物体置信度,shape==12700
        mask = score > conf_thres  # 物体置信度阈值mask==[False,False,True......],shape==12700,True将会被保留，False列将会被删除
        detections = predictions[mask]  # 第一次筛选  shape==(115,85)
        class_conf = np.expand_dims(np.max(detections[:,5:],axis=-1),axis=-1)  # 获取每个预测框预测的类别置信度
        class_pred = np.expand_dims(np.argmax(detections[:,5:],axis=-1),axis=-1)  # 获取每个预测框的类别下标
        # 结果堆叠，(num_boxes,位置信息4+包含物体概率1+类别置信度1+类别序号1)
        detections = np.concatenate([detections[:,:4],class_conf,class_pred],axis=-1)  # shape=(numbox,7)

        unique_class = np.unique(detections[:,-1])  # 取出包含的所有类别
        if len(unique_class)==0:
            continue
        best_box = []
        for c in unique_class:
            # 取出类别为c的预测结果
            cls_mask = detections[:,-1] == c
            detection = detections[cls_mask] # shape=(82,7)

            # 包含物体类别概率从高至低排列
            scores = detection[:,4]
            arg_sort = np.argsort(scores)[::-1]  # 返回的是索引
            detection = detection[arg_sort]

            while len(detection) != 0:
                best_box.append(detection[0])
                if len(detection) == 1:
                    break
                # 计算当前置信度最大的框和其它预测框的iou
                ious = iou(best_box[-1],detection[1:])
                detection = detection[1:][ious < nms_thres]  # 小于nms_thres将被保留，每一轮至少减少一个
        output.append(best_box)

    boxes_loc = []
    conf_loc = []
    class_loc = []
    if len(output):
        for i in range(len(output)):
            pred = output[i]
            for i, det in enumerate(pred):
                if len(det):
                    # 将框坐标调整回原始图像中
                    det[0] = (det[0] - pad[0]) / ratio
                    det[2] = (det[2] - pad[0]) / ratio
                    det[1] = (det[1] - pad[1]) / ratio
                    det[3] = (det[3] - pad[1]) / ratio
                    boxes_loc.append([det[0],det[1],det[2],det[3]])
                    conf_loc.append(det[4])
                    class_loc.append(det[5])
    return boxes_loc,conf_loc,class_loc

def plot_box(img,boxes,conf,clas_id,line_thickness=3,names=None):
    # 画位置框
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    c1, c2 = (int(boxes[0]), int(boxes[1])), (int(boxes[2]),int(boxes[3]))
    cv2.rectangle(img, c1, c2, [0, 0 ,255], thickness=tl, lineType=cv2.LINE_AA)

    # 画类别信息框
    label = f'{names[int(clas_id)]} {conf:.2f}'
    tf = max(tl - 1, 1)  # label字体的线宽 font thickness
    t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
    c2 = (c1[0] + t_size[0], c1[1] - t_size[1] - 3)
    cv2.rectangle(img, c1, c2, [255, 0 ,0], -1, cv2.LINE_AA)
    cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255], thickness=tf, lineType=cv2.LINE_AA)


if __name__ == '__main__':
    ie = IECore()
    #net = ie.read_network(model=model_path)
    net = ie.read_network(model=model_xml, weights=model_bin)
    exec_net = ie.load_network(network=net, device_name="CPU")
    input_layer = next(iter(net.input_info))
    frame = cv2.imread("bus.jpg") 
    img, ratio, (dw,dh) = letterbox(frame)

    blob = cv2.dnn.blobFromImage(np.ascontiguousarray(img), 1/255.0, (img.shape[0], img.shape[1]), swapRB=True, crop=False)
    infer_request_handle=exec_net.start_async(request_id=0,inputs={input_layer: blob})
    if infer_request_handle.wait(-1) == 0:
        res = infer_request_handle.output_blobs["output0"]
        outs = res.buffer
        boxes_loc,conf_loc,class_loc = non_max_suppression(outs, conf_thres=conf_thres, nms_thres=nms_thres,ratio=ratio, pad=(dw,dh))

        for i in range(len(boxes_loc)):
            boxes = boxes_loc[i]
            conf = conf_loc[i]
            clas_id = class_loc[i]
            plot_box(frame, boxes, conf, clas_id, line_thickness=3, names=names)

    cv2.imshow("result", frame)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

或者利用yolov5自带的detect.py：

python detect.py --weights ./POT_INT8_openvino_model

C++推理：（参考基于OpenVNO C++ API部署YOLOv5模型）

#include <iostream>
#include <string>
#include <openvino/openvino.hpp> 
#include <opencv2/opencv.hpp>   


/* ---------  Please modify the path of yolov5 model and image -----------*/
std::string model_file = "yolov5n_int8.xml";
 //可以改成yolov5n_fp16.xml或yolov5n_fp32.xml或yolov5n_fp32.onnx（支持fp32、fp16、int8的IR模型推理和fp32的onnx模型推理）
std::string image_file = "bus.jpg";

const std::vector<std::string> class_names = {
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
	"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
	"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
	"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
	"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
	"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
	"hair drier", "toothbrush" };


cv::Mat letterbox(cv::Mat& img, std::vector<float>& paddings, std::vector<int> new_shape = { 640, 640 })
{
	// Get current image shape [height, width]
	int img_h = img.rows;
	int img_w = img.cols;

	// Compute scale ratio(new / old) and target resized shape
	float scale = std::min(new_shape[1] * 1.0 / img_h, new_shape[0] * 1.0 / img_w);
	int resize_h = int(round(img_h * scale));
	int resize_w = int(round(img_w * scale));
	paddings[0] = scale;

	// Compute padding
	int pad_h = new_shape[1] - resize_h;
	int pad_w = new_shape[0] - resize_w;

	// Resize and pad image while meeting stride-multiple constraints
	cv::Mat resized_img;
	cv::resize(img, resized_img, cv::Size(resize_w, resize_h));

	// divide padding into 2 sides
	float half_h = pad_h * 1.0 / 2;
	float half_w = pad_w * 1.0 / 2;
	paddings[1] = half_h;
	paddings[2] = half_w;

	// Compute padding boarder
	int top = int(round(half_h - 0.1));
	int bottom = int(round(half_h + 0.1));
	int left = int(round(half_w - 0.1));
	int right = int(round(half_w + 0.1));

	// Add border
	cv::copyMakeBorder(resized_img, resized_img, top, bottom, left, right, 0, cv::Scalar(114, 114, 114));

	return resized_img;
}


int main(int argc, char* argv[]) 
{
	// -------- Get OpenVINO runtime version --------
	std::cout << ov::get_openvino_version().description << ':' << ov::get_openvino_version().buildNumber << std::endl;

	// -------- Step 1. Initialize OpenVINO Runtime Core --------
	ov::Core core;

	// -------- Step 2. Compile the Model --------
	auto compiled_model = core.compile_model(model_file, "CPU"); 

	// -------- Step 3. Create an Inference Request --------
	ov::InferRequest infer_request = compiled_model.create_infer_request();

	clock_t start = clock();
	// -------- Step 4. Read a picture file and do the preprocess --------
	cv::Mat img = cv::imread(image_file); //Load a picture into memory
	std::vector<float> paddings(3);       //scale, half_h, half_w
	cv::Mat resized_img = letterbox(img, paddings); //resize to (640,640) by letterbox
	cv::Mat blob = cv::dnn::blobFromImage(resized_img, 1 / 255.0, cv::Size(640, 640), cv::Scalar(0, 0, 0), true); 	// BGR->RGB, u8(0-255)->f32(0.0-1.0), HWC->NCHW

	// -------- Step 5. Feed the blob into the input node of YOLOv5 -------	
	auto input_port = compiled_model.input(); // Get input port for model with one input
	ov::Tensor input_tensor(input_port.get_element_type(), input_port.get_shape(), blob.ptr(0)); // Create tensor from external memory
	infer_request.set_input_tensor(input_tensor); // Set input tensor for model with one input

	// -------- Step 6. Start inference --------
	for (size_t i = 0; i < 100; i++)
		infer_request.infer();

	// -------- Step 7. Get the inference result --------
	auto output = infer_request.get_output_tensor(0);
	auto output_shape = output.get_shape();
	std::cout << "The shape of output tensor:" << output_shape << std::endl;
	cv::Mat output_buffer(output_shape[1], output_shape[2], CV_32F, output.data()); 

	// -------- Step 8. Post-process the inference result -----------
	float conf_threshold = 0.25;
	float nms_threshold = 0.5;
	std::vector<cv::Rect> boxes;
	std::vector<int> class_ids;
	std::vector<float> class_scores;
	std::vector<float> confidences;
	for (int i = 0; i < output_buffer.rows; i++) 
	{
		float confidence = output_buffer.at<float>(i, 4);
		if (confidence < conf_threshold) 
			continue;

		cv::Mat classes_scores = output_buffer.row(i).colRange(5, 85);
		cv::Point class_id;
		double score;
		cv::minMaxLoc(classes_scores, NULL, &score, NULL, &class_id);

		if (score > 0.25)
		{
			float cx = output_buffer.at<float>(i, 0);
			float cy = output_buffer.at<float>(i, 1);
			float w = output_buffer.at<float>(i, 2);
			float h = output_buffer.at<float>(i, 3);
			int left = static_cast<int>((cx - 0.5 * w - paddings[2]) / paddings[0]);
			int top = static_cast<int>((cy - 0.5 * h - paddings[1]) / paddings[0]);
			int width = static_cast<int>(w / paddings[0]);
			int height = static_cast<int>(h / paddings[0]);
			cv::Rect box;
			box.x = left;
			box.y = top;
			box.width = width;
			box.height = height;

			boxes.push_back(box);
			class_ids.push_back(class_id.x);
			class_scores.push_back(score);
			confidences.push_back(confidence);
		}
	}

	// NMS
	std::vector<int> indices;
	cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, nms_threshold, indices);

	clock_t end = clock();
	std::cout << end - start << std::endl;

	// -------- Step 8. Visualize the detection results -----------
	for (size_t i = 0; i < indices.size(); i++) 
	{
		int index = indices[i];
		int class_id = class_ids[index];
		cv::rectangle(img, boxes[index], cv::Scalar(0, 0, 255), 2, 8);
		std::string label = class_names[class_id] + ":" + std::to_string(class_scores[index]);
		cv::putText(img, label, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 10), cv::FONT_HERSHEY_SIMPLEX, .5, cv::Scalar(255, 0, 0));
	}

	cv::imshow("YOLOv5 OpenVINO Inference C++ Demo", img);
	cv::waitKey(0);

	return 0;
}

C++在i7-12700 CPU下推理fp32、fp16、int8模型循环100轮，耗时如下（各跑3次）：
yolov5n_fp32：1599ms 2040ms 1514ms
yolov5n_fp16：1505ms 2078ms 1514ms
yolov5n_int8： 856ms 861ms 852ms

fp32和fp16模型推理耗时差不多，int8能缩短推理耗时到一半左右。

yolov5 tensorrt量化部署

方法一：wts转trt的硬解析方案
这个借助的是大佬的工程：https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5
python和c++的推理代码都有。不得不佩服人家写的确实很详细了，有这么好的轮子干嘛不直接拿来用呢，哈哈。
配置环境的过程还参考了博文：windows上配置TensorRT yolov5 -6.0部署 tensorrtx视频流推理
LZ测试了一下yolov5各种模型单张图片的推理耗时如下（C++，RTX3070 gpu ）：
yolov5n-int8： 2ms 1ms
yolov5s-int8： 2ms 1ms
yolov5m-int8：3ms 2ms
yolov5l-int8： 4ms 3ms
yolov5x-int8 ：7ms 6ms

yolov5n-fp16： 1ms 1ms
yolov5s-fp16： 2ms 2ms
yolov5m-fp16：4ms 3ms
yolov5l-fp16： 6ms 5ms
yolov5x-fp16：10ms 9ms

yolov5n-fp32： 424ms 2ms
yolov5s-fp32： 389ms 4ms
yolov5m-fp32：401ms 9ms
yolov5l-fp32： 422ms 17ms
yolov5x-fp32： 30ms 28ms
其中在我的机器上，fp32模型的yolov5n-yolov5l版本首次推理时间较长，不清楚是什么原因。

方法二：onnx解析tensorrt 的api方案
onnx转tensorrt engine int8量化：
python代码（参考https://github.com/Wulingtian/yolov5_tensorrt_int8_tools）
calibrator.py

import os
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import logging
logger = logging.getLogger(__name__)


# calibrator
class Calibrator(trt.IInt8EntropyCalibrator2):
    def __init__(self, stream, cache_file=""):
        trt.IInt8EntropyCalibrator2.__init__(self)       
        self.stream = stream
        self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes)
        self.cache_file = cache_file

    def get_batch_size(self):
        return self.stream.batch_size

    def get_batch(self, names):
        batch = self.stream.next_batch()
        if not batch.size:   
            return None
        cuda.memcpy_htod(self.d_input, batch)
        return [int(self.d_input)]

    def read_calibration_cache(self):
        if os.path.exists(self.cache_file):
            with open(self.cache_file, "rb") as f:
                logger.info("Using calibration cache to save time: {:}".format(self.cache_file))
                return f.read()

    def write_calibration_cache(self, cache):
        with open(self.cache_file, "wb") as f:
            logger.info("Caching calibration data for future use: {:}".format(self.cache_file))
            f.write(cache)

convert_trt_quant.py

import glob, os, cv2
import numpy as np
import tensorrt as trt
from calibrator import Calibrator


height = 640
width = 640
image_path = 'images'
model_path = "yolov5n_fp32.onnx"
engine_model_path = "models_save/yolov5n_int8.trt"
calibration_table = 'models_save/yolov5n_calibration.cache'

TRT_LOGGER = trt.Logger(trt.Logger.WARNING) 


def preprocess(image):
    h, w, c = image.shape
    r_w = width / w
    r_h = height / h
    if r_h > r_w:
        tw = width
        th = int(r_w * h)
        tx1 = tx2 = 0
        ty1 = int((height - th) / 2)
        ty2 = height - th - ty1
    else:
        tw = int(r_h * w)
        th = height
        tx1 = int((width - tw) / 2)
        tx2 = width - tw - tx1
        ty1 = ty2 = 0
    image = cv2.resize(image, (tw, th))
    image = cv2.copyMakeBorder(image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, (128, 128, 128))
    image = image[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)  #BGR2RGB和HWC2CHW
    image = image / 255.0
    return image


class DataLoader:
    def __init__(self):
        self.index = 0
        self.length = 8
        self.batch_size = 16
        self.img_list = glob.glob(os.path.join(image_path, "*.jpg"))
        assert len(self.img_list) >= self.batch_size * self.length
        self.calibration_data = np.zeros((self.batch_size, 3, height, width), dtype=np.float32)

    def next_batch(self):
        if self.index < self.length:
            for i in range(self.batch_size):
                assert os.path.exists(self.img_list[i + self.index * self.batch_size]), 'not found!!'
                img = cv2.imread(self.img_list[i + self.index * self.batch_size])
                img = preprocess(img)
                self.calibration_data[i] = img
            self.index += 1
            return np.ascontiguousarray(self.calibration_data, dtype=np.float32)
        else:
            return np.array([])

    def __len__(self):
        return self.length


def get_engine(onnx_file_path="", engine_file_path="", calibration_stream=None, calibration_table_path=""):
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(1) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:      
        if not os.path.exists(onnx_file_path):
            quit('ONNX file {} not found'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:
            parser.parse(model.read())
            assert network.num_layers > 0, 'Failed to parse ONNX model. Please check if the ONNX model is compatible '      
        builder.max_batch_size = 1
        config = builder.create_builder_config()
        config.max_workspace_size = 1 << 30
        config.set_flag(trt.BuilderFlag.INT8)
        assert calibration_stream, 'Error: a calibration_stream should be provided for int8 mode'
        config.int8_calibrator  = Calibrator(calibration_stream, calibration_table_path)
        runtime = trt.Runtime(TRT_LOGGER)
        plan = builder.build_serialized_network(network, config)
        engine = runtime.deserialize_cuda_engine(plan)
        if engine is None:
            print('Failed to create the engine')
        with open(engine_file_path, "wb") as f:
            f.write(engine.serialize())


if __name__ == '__main__':
    get_engine(model_path, engine_model_path, calibration_stream=DataLoader(), calibration_table_path=calibration_table)

cpp代码
calibrator.h

#ifndef ENTROPY_CALIBRATOR_H
#define ENTROPY_CALIBRATOR_H

#include <NvInfer.h>
#include <string>
#include <vector>	


//! \class Int8EntropyCalibrator2
//!
//! \brief Implements Entropy calibrator 2.
//!  CalibrationAlgoType is kENTROPY_CALIBRATION_2.
//!
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2
{
public:
    Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache = true);

    virtual ~Int8EntropyCalibrator2();
    int getBatchSize() const noexcept override;
    bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override;
    const void* readCalibrationCache(size_t& length) noexcept override;
    void writeCalibrationCache(const void* cache, size_t length) noexcept override;

private:
    int batchsize_;
    int input_w_;
    int input_h_;
    int img_idx_;
    std::string img_dir_;
    std::vector<std::string> img_files_;
    size_t input_count_;
    std::string calib_table_name_;
    const char* input_blob_name_;
    bool read_cache_;
    void* device_input_;
    std::vector<char> calib_cache_;
};

#endif // ENTROPY_CALIBRATOR_H

calibrator.cpp

#include <iostream>
#include <iterator>
#include <fstream>
#include <opencv2/opencv.hpp>
#include "calibrator.h"


Int8EntropyCalibrator2::Int8EntropyCalibrator2(int batchsize, int input_w, int input_h, const char* img_dir, const char* calib_table_name, const char* input_blob_name, bool read_cache)
    : batchsize_(batchsize)
    , input_w_(input_w)
    , input_h_(input_h)
    , img_idx_(0)
    , img_dir_(img_dir)
    , calib_table_name_(calib_table_name)
    , input_blob_name_(input_blob_name)
    , read_cache_(read_cache)
{
    input_count_ = 3 * input_w * input_h * batchsize;
	cudaMalloc(&device_input_, input_count_ * sizeof(float));
	cv::glob(img_dir, img_files_, false);
}

Int8EntropyCalibrator2::~Int8EntropyCalibrator2()
{
	cudaFree(device_input_);
}

int Int8EntropyCalibrator2::getBatchSize() const noexcept
{
    return batchsize_;
}

bool Int8EntropyCalibrator2::getBatch(void* bindings[], const char* names[], int nbBindings) noexcept
{
    if (img_idx_ + batchsize_ > (int)img_files_.size()) {
        return false;
    }

    std::vector<cv::Mat> input_imgs_;
    for (int i = img_idx_; i < img_idx_ + batchsize_; i++) {
        std::cout << img_files_[i] << "  " << i << std::endl;
        cv::Mat temp = cv::imread(img_files_[i]);
        if (temp.empty()){
            std::cerr << "Fatal error: image cannot open!" << std::endl;
            return false;
        }

		int w, h, x, y;
		float r_w = input_w_ / (temp.cols * 1.0);
		float r_h = input_h_ / (temp.rows * 1.0);
		if (r_h > r_w) {
			w = input_w_;
			h = r_w * temp.rows;
			x = 0;
			y = (input_h_ - h) / 2;
		}
		else {
			w = r_h * temp.cols;
			h = input_h_;
			x = (input_w_ - w) / 2;
			y = 0;
		}
		cv::Mat re(h, w, CV_8UC3);
		cv::resize(temp, re, re.size(), 0, 0, cv::INTER_LINEAR);
		cv::Mat pr_img(input_h_, input_w_, CV_8UC3, cv::Scalar(128, 128, 128));
		re.copyTo(pr_img(cv::Rect(x, y, re.cols, re.rows)));

        input_imgs_.push_back(pr_img);
    }
    img_idx_ += batchsize_;
    cv::Mat blob = cv::dnn::blobFromImages(input_imgs_, 1.0 / 255.0, cv::Size(input_w_, input_h_), cv::Scalar(0, 0, 0), true, false);

	cudaMemcpy(device_input_, blob.ptr<float>(0), input_count_ * sizeof(float), cudaMemcpyHostToDevice);
    assert(!strcmp(names[0], input_blob_name_));
    bindings[0] = device_input_;
    return true;
}

const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) noexcept
{
    std::cout << "reading calib cache: " << calib_table_name_ << std::endl;
    calib_cache_.clear();
    std::ifstream input(calib_table_name_, std::ios::binary);
    input >> std::noskipws;
    if (read_cache_ && input.good())
    {
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(calib_cache_));
    }
    length = calib_cache_.size();
    return length ? calib_cache_.data() : nullptr;
}

void Int8EntropyCalibrator2::writeCalibrationCache(const void* cache, size_t length) noexcept
{
    std::cout << "writing calib cache: " << calib_table_name_ << " size: " << length << std::endl;
    std::ofstream output(calib_table_name_, std::ios::binary);
    output.write(reinterpret_cast<const char*>(cache), length);
}

main.cpp

#include <NvInfer.h> // 编译用的头文件
#include <NvOnnxParser.h> // onnx解析器的头文件
#include <NvInferRuntime.h> // 推理用的运行时头文件
#include <cuda_runtime.h> // cuda include
#include <iostream>
#include <assert.h>
#include "calibrator.h"


#define USE_INT8 


inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
	switch (t)
	{
	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
	case nvinfer1::ILogger::Severity::kERROR:   return "error";
	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
	case nvinfer1::ILogger::Severity::kINFO:    return "info";
	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
	default: return "unknow";
	}
}


class TRTLogger : public nvinfer1::ILogger
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO)
		{
			if (severity == Severity::kWARNING)
				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
			else if (severity <= Severity::kERROR)
				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
			else
				printf("%s: %s\n", severity_string(severity), msg);
		}
	}
} logger;


bool build_model()
{
	TRTLogger logger;

	// ----------------------------- 1. 定义 builder, config 和network -----------------------------
	nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(logger);
	nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
	nvinfer1::INetworkDefinition* network = builder->createNetworkV2(1);

	// ----------------------------- 2. 输入，模型结构和输出的基本信息 -----------------------------
	// 通过onnxparser解析的结果会填充到network中，类似addConv的方式添加进去
	nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, logger);
	if (!parser->parseFromFile("yolov5n_fp32.onnx", 1))
	{
		printf("Failed to parser onnx\n");
		return false;
	}

	int maxBatchSize = 1;
	printf("Workspace Size = %.2f MB\n", (1 << 30) / 1024.0f / 1024.0f);
	config->setMaxWorkspaceSize(1 << 30);

#if defined USE_FP16
	config->setFlag(nvinfer1::BuilderFlag::kFP16);
#elif defined USE_INT8
	assert(builder->platformHasFastInt8());
	config->setFlag(nvinfer1::BuilderFlag::kINT8);
	Int8EntropyCalibrator2 * calibrator = new Int8EntropyCalibrator2(1, 640, 640, "./images/", "int8calib.table", "images");
	config->setInt8Calibrator(calibrator);
#endif

	// --------------------------------- 2.1 关于profile ----------------------------------
	// 如果模型有多个输入，则必须多个profile
	auto profile = builder->createOptimizationProfile();
	auto input_tensor = network->getInput(0);
	int input_channel = input_tensor->getDimensions().d[1];
	auto input_dims = input_tensor->getDimensions();

	// 配置输入的最小、最优、最大的范围
	input_dims.d[0] = 1;
	profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
	profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims);
	input_dims.d[0] = maxBatchSize;
	profile->setDimensions(input_tensor->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims);

	// 添加到配置
	config->addOptimizationProfile(profile);

	nvinfer1::ICudaEngine * engine = builder->buildEngineWithConfig(*network, *config);
	if (engine == nullptr)
	{
		printf("Build engine failed.\n");
		return false;
	}

	// -------------------------- 3. 序列化 ----------------------------------
	// 将模型序列化，并储存为文件
	nvinfer1::IHostMemory* model_data = engine->serialize();
	FILE* f = fopen("yolov5n_int8.trt", "wb");
	fwrite(model_data->data(), 1, model_data->size(), f);
	fclose(f);

	// 卸载顺序按照构建顺序倒序
	model_data->destroy();
	parser->destroy();
	engine->destroy();
	network->destroy();
	config->destroy();
	builder->destroy();

	return true;
}


int main()
{
	build_model();
	return 0;
}

tensorrt推理代码：
python代码

import cv2
import numpy as np
import tensorrt as trt
import pycuda.autoinit 
import pycuda.driver as cuda  


class_names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
        'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
        'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
        'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
        'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
        'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
        'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
        'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
        'hair drier', 'toothbrush'] #coco80类别      
input_shape = (640, 640) 
score_threshold = 0.2  
nms_threshold = 0.5
confidence_threshold = 0.2   


def nms(boxes, scores, score_threshold, nms_threshold):
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (y2 - y1 + 1) * (x2 - x1 + 1)
    keep = []
    index = scores.argsort()[::-1] 

    while index.size > 0:
        i = index[0]
        keep.append(i)
        x11 = np.maximum(x1[i], x1[index[1:]]) 
        y11 = np.maximum(y1[i], y1[index[1:]])
        x22 = np.minimum(x2[i], x2[index[1:]])
        y22 = np.minimum(y2[i], y2[index[1:]])
        w = np.maximum(0, x22 - x11 + 1)                              
        h = np.maximum(0, y22 - y11 + 1) 
        overlaps = w * h
        ious = overlaps / (areas[i] + areas[index[1:]] - overlaps)
        idx = np.where(ious <= nms_threshold)[0]
        index = index[idx + 1]
    return keep


def xywh2xyxy(x):
    y = np.copy(x)
    y[:, 0] = x[:, 0] - x[:, 2] / 2
    y[:, 1] = x[:, 1] - x[:, 3] / 2
    y[:, 2] = x[:, 0] + x[:, 2] / 2
    y[:, 3] = x[:, 1] + x[:, 3] / 2
    return y


def filter_box(outputs): #过滤掉无用的框    
    outputs = np.squeeze(outputs).astype(dtype=np.float32)
    outputs = outputs[outputs[..., 4] > confidence_threshold]
    classes_scores = outputs[..., 5:]
     
    boxes = []
    scores = []
    class_ids = []
    for i in range(len(classes_scores)):
        class_id = np.argmax(classes_scores[i])
        outputs[i][4] *= classes_scores[i][class_id]
        outputs[i][5] = class_id
        if outputs[i][4] > score_threshold:
            boxes.append(outputs[i][:6])
            scores.append(outputs[i][4])
            class_ids.append(outputs[i][5])
            
    boxes = np.array(boxes)
    boxes = xywh2xyxy(boxes)
    scores = np.array(scores)
    indices = nms(boxes, scores, score_threshold, nms_threshold) 
    output = boxes[indices]
    return output


def letterbox(im, new_shape=(416, 416), color=(114, 114, 114)):
    # Resize and pad image while meeting stride-multiple constraints
    shape = im.shape[:2]  # current shape [height, width]

    # Scale ratio (new / old)
    r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
    
    # Compute padding
    new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))    
    dw, dh = (new_shape[1] - new_unpad[0])/2, (new_shape[0] - new_unpad[1])/2  # wh padding 
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    
    if shape[::-1] != new_unpad:  # resize
        im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)  # add border
    return im


def scale_boxes(input_shape, boxes, shape):
    # Rescale boxes (xyxy) from input_shape to shape
    gain = min(input_shape[0] / shape[0], input_shape[1] / shape[1])  # gain  = old / new
    pad = (input_shape[1] - shape[1] * gain) / 2, (input_shape[0] - shape[0] * gain) / 2  # wh padding
    boxes[..., [0, 2]] -= pad[0]  # x padding
    boxes[..., [1, 3]] -= pad[1]  # y padding
    boxes[..., :4] /= gain
    boxes[..., [0, 2]] = boxes[..., [0, 2]].clip(0, shape[1])  # x1, x2
    boxes[..., [1, 3]] = boxes[..., [1, 3]].clip(0, shape[0])  # y1, y2
    return boxes


def draw(image, box_data):
    box_data = scale_boxes(input_shape, box_data, image.shape)
    boxes = box_data[...,:4].astype(np.int32) 
    scores = box_data[...,4]
    classes = box_data[...,5].astype(np.int32)
   
    for box, score, cl in zip(boxes, scores, classes):
        top, left, right, bottom = box
        cv2.rectangle(image, (top, left), (right, bottom), (255, 0, 0), 1)
        cv2.putText(image, '{0} {1:.2f}'.format(class_names[cl], score), (top, left), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 1)


if __name__=="__main__":
    logger = trt.Logger(trt.Logger.WARNING)
    with open("yolov5n_int8.engine", "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()
    inputs_host = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
    outputs_host = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
    inputs_device = cuda.mem_alloc(inputs_host.nbytes)
    outputs_device = cuda.mem_alloc(outputs_host.nbytes)
    stream = cuda.Stream()
    
    image = cv2.imread('bus.jpg')
    input = letterbox(image, input_shape)
    input = input[:, :, ::-1].transpose(2, 0, 1).astype(dtype=np.float32)  #BGR2RGB和HWC2CHW
    input = input / 255.0
    input = np.expand_dims(input, axis=0)
    np.copyto(inputs_host, input.ravel())

    with engine.create_execution_context() as context:
        cuda.memcpy_htod_async(inputs_device, inputs_host, stream)
        context.execute_async_v2(bindings=[int(inputs_device), int(outputs_device)], stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(outputs_host, outputs_device, stream)
        stream.synchronize()  
        boxes = filter_box(outputs_host.reshape(context.get_binding_shape(1)))
        draw(image, boxes)
        cv2.imwrite('result.jpg', image)

cpp代码

#include <iostream>
#include <fstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>
#include <NvInfer.h>
#include <NvInferRuntime.h>


const std::vector<std::string> class_names = {
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
	"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
	"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
	"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
	"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
	"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
	"hair drier", "toothbrush" };			//类别名称

const int input_width = 640;
const int input_height = 640;
const float score_threshold = 0.2;
const float nms_threshold = 0.5;
const float confidence_threshold = 0.2;
const int input_numel = 1 * 3 * input_width * input_height;
const int num_classes = class_names.size();
const int output_numprob = 5 + num_classes;
const int output_numbox = 3 * (input_width / 8 * input_height / 8 + input_width / 16 * input_height / 16 + input_width / 32 * input_height / 32);
const int output_numel = 1 * output_numprob * output_numbox;


inline const char* severity_string(nvinfer1::ILogger::Severity t)
{
	switch (t)
	{
	case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: return "internal_error";
	case nvinfer1::ILogger::Severity::kERROR:   return "error";
	case nvinfer1::ILogger::Severity::kWARNING: return "warning";
	case nvinfer1::ILogger::Severity::kINFO:    return "info";
	case nvinfer1::ILogger::Severity::kVERBOSE: return "verbose";
	default: return "unknow";
	}
}


class TRTLogger : public nvinfer1::ILogger
{
public:
	virtual void log(Severity severity, nvinfer1::AsciiChar const* msg) noexcept override
	{
		if (severity <= Severity::kINFO)
		{
			if (severity == Severity::kWARNING)
				printf("\033[33m%s: %s\033[0m\n", severity_string(severity), msg);
			else if (severity <= Severity::kERROR)
				printf("\033[31m%s: %s\033[0m\n", severity_string(severity), msg);
			else
				printf("%s: %s\n", severity_string(severity), msg);
		}
	}
} logger;


std::vector<unsigned char> load_file(const std::string & file) 
{
	std::ifstream in(file, std::ios::in | std::ios::binary);
	if (!in.is_open())
		return {};

	in.seekg(0, std::ios::end);
	size_t length = in.tellg();

	std::vector<uint8_t> data;
	if (length > 0) 
	{
		in.seekg(0, std::ios::beg);
		data.resize(length);
		in.read((char*)& data[0], length);
	}
	in.close();
	return data;
}


//LetterBox处理
void LetterBox(const cv::Mat& image, cv::Mat& outImage,
	const cv::Size& newShape = cv::Size(640, 640), const cv::Scalar& color = cv::Scalar(114, 114, 114))
{
	cv::Size shape = image.size();
	float r = std::min((float)newShape.height / (float)shape.height, (float)newShape.width / (float)shape.width);
	float ratio[2]{ r, r };
	int new_un_pad[2] = { (int)std::round((float)shape.width * r),(int)std::round((float)shape.height * r) };

	auto dw = (float)(newShape.width - new_un_pad[0]) / 2;
	auto dh = (float)(newShape.height - new_un_pad[1]) / 2;

	if (shape.width != new_un_pad[0] && shape.height != new_un_pad[1])
		cv::resize(image, outImage, cv::Size(new_un_pad[0], new_un_pad[1]));
	else
		outImage = image.clone();

	int top = int(std::round(dh - 0.1f));
	int bottom = int(std::round(dh + 0.1f));
	int left = int(std::round(dw - 0.1f));
	int right = int(std::round(dw + 0.1f));

	cv::Vec4d params;
	params[0] = ratio[0];
	params[1] = ratio[1];
	params[2] = left;
	params[3] = top;

	cv::copyMakeBorder(outImage, outImage, top, bottom, left, right, cv::BORDER_CONSTANT, color);
}


//预处理
void pre_process(cv::Mat& image, float* input_data_host)
{
	cv::Mat letterbox;
	LetterBox(image, letterbox, cv::Size(input_width, input_height));
	letterbox.convertTo(letterbox, CV_32FC3, 1.0f / 255.0f);

	int image_area = letterbox.cols * letterbox.rows;
	float* pimage = (float*)letterbox.data;
	float* phost_b = input_data_host + image_area * 0;
	float* phost_g = input_data_host + image_area * 1;
	float* phost_r = input_data_host + image_area * 2;
	for (int i = 0; i < image_area; ++i, pimage += 3)
	{
		*phost_r++ = pimage[0];
		*phost_g++ = pimage[1];
		*phost_b++ = pimage[2];
	}
	//std::vector<cv::Mat> split_images;
	//cv::split(letterbox, split_images);
	//for (size_t i = 0; i < letterbox.channels(); ++i)
	//{
	//	std::vector<float> split_image_data = split_images[i].reshape(1, 1);
	//	std::copy(split_image_data.begin(), split_image_data.end(), input_data_host +  i * split_image_data.size());
	//}
}


//网络推理
void process(std::string model, float* input_data_host, float* output_data_host)
{
	TRTLogger logger;
	auto engine_data = load_file(model);
	auto runtime = nvinfer1::createInferRuntime(logger);
	auto engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());

	cudaStream_t stream = nullptr;
	cudaStreamCreate(&stream);
	auto execution_context = engine->createExecutionContext();

	float* input_data_device = nullptr;
	cudaMalloc(&input_data_device, sizeof(float) * input_numel);
	cudaMemcpyAsync(input_data_device, input_data_host, sizeof(float) * input_numel, cudaMemcpyHostToDevice, stream);

	float* output_data_device = nullptr;
	cudaMalloc(&output_data_device, sizeof(float) * output_numel);

	float* bindings[] = { input_data_device, output_data_device };
	execution_context->enqueueV2((void**)bindings, stream, nullptr);
	cudaMemcpyAsync(output_data_host, output_data_device, sizeof(float) * output_numel, cudaMemcpyDeviceToHost, stream);
	cudaStreamSynchronize(stream);

	cudaStreamDestroy(stream);
	cudaFree(input_data_device);
	cudaFree(output_data_device);
}


//NMS
void nms(std::vector<cv::Rect>& boxes, std::vector<float>& scores, float score_threshold, float nms_threshold, std::vector<int>& indices)
{
	assert(boxes.size() == scores.size());

	struct BoxScore
	{
		cv::Rect box;
		float score;
		int id;
	};
	std::vector<BoxScore> boxes_scores;
	for (size_t i = 0; i < boxes.size(); i++)
	{
		BoxScore box_conf;
		box_conf.box = boxes[i];
		box_conf.score = scores[i];
		box_conf.id = i;
		if (scores[i] > score_threshold)	boxes_scores.push_back(box_conf);
	}

	std::sort(boxes_scores.begin(), boxes_scores.end(), [](BoxScore a, BoxScore b) { return a.score > b.score; });

	std::vector<float> area(boxes_scores.size());
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		area[i] = boxes_scores[i].box.width * boxes_scores[i].box.height;
	}

	std::vector<bool> isSuppressed(boxes_scores.size(), false);
	for (size_t i = 0; i < boxes_scores.size(); ++i)
	{
		if (isSuppressed[i])  continue;
		for (size_t j = i + 1; j < boxes_scores.size(); ++j)
		{
			if (isSuppressed[j])  continue;

			float x1 = (std::max)(boxes_scores[i].box.x, boxes_scores[j].box.x);
			float y1 = (std::max)(boxes_scores[i].box.y, boxes_scores[j].box.y);
			float x2 = (std::min)(boxes_scores[i].box.x + boxes_scores[i].box.width, boxes_scores[j].box.x + boxes_scores[j].box.width);
			float y2 = (std::min)(boxes_scores[i].box.y + boxes_scores[i].box.height, boxes_scores[j].box.y + boxes_scores[j].box.height);
			float w = (std::max)(0.0f, x2 - x1);
			float h = (std::max)(0.0f, y2 - y1);
			float inter = w * h;
			float ovr = inter / (area[i] + area[j] - inter);

			if (ovr >= nms_threshold)  isSuppressed[j] = true;
		}
	}

	for (int i = 0; i < boxes_scores.size(); ++i)
	{
		if (!isSuppressed[i])	indices.push_back(boxes_scores[i].id);
	}
}


//box缩放到原图尺寸
void scale_boxes(cv::Rect & box, cv::Size size)
{
	float gain = std::min(input_width * 1.0 / size.width, input_height * 1.0 / size.height);
	int pad_w = (input_width - size.width * gain) / 2;
	int pad_h = (input_height - size.height * gain) / 2;
	box.x -= pad_w;
	box.y -= pad_h;
	box.x /= gain;
	box.y /= gain;
	box.width /= gain;
	box.height /= gain;
}


//可视化函数
void draw_result(cv::Mat& image, std::string label, cv::Rect box)
{
	cv::rectangle(image, box, cv::Scalar(255, 0, 0), 1);
	int baseLine;
	cv::Size label_size = cv::getTextSize(label, 1, 1, 1, &baseLine);
	cv::Point tlc = cv::Point(box.x, box.y);
	cv::Point brc = cv::Point(box.x, box.y + label_size.height + baseLine);
	cv::putText(image, label, cv::Point(box.x, box.y), cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 0, 255), 1);
}


//后处理
cv::Mat post_process(cv::Mat & image, cv::Mat & result, float* output_data_host)
{
	std::vector<cv::Rect> boxes;
	std::vector<float> scores;
	std::vector<int> class_ids;

	for (int i = 0; i < output_numbox; ++i)//25200
	{
		float* ptr = output_data_host + i * output_numprob;//85
		float objness = ptr[4];
		if (objness < confidence_threshold)
			continue;

		float* classes_scores = 5 + ptr;
		int class_id = std::max_element(classes_scores, classes_scores + num_classes) - classes_scores;
		float max_class_score = classes_scores[class_id];
		float score = max_class_score * objness;
		if (score < score_threshold)
			continue;

		float x = ptr[0];
		float y = ptr[1];
		float w = ptr[2];
		float h = ptr[3];
		int left = int(x - 0.5 * w);
		int top = int(y - 0.5 * h);
		int width = int(w);
		int height = int(h);

		cv::Rect box = cv::Rect(left, top, width, height);
		scale_boxes(box, image.size());
		boxes.push_back(box);
		scores.push_back(score);
		class_ids.push_back(class_id);
	}

	std::vector<int> indices;
	nms(boxes, scores, score_threshold, nms_threshold, indices);
	for (int i = 0; i < indices.size(); i++)
	{
		int idx = indices[i];
		cv::Rect box = boxes[idx];
		std::string label = class_names[class_ids[idx]] + ":" + cv::format("%.2f", scores[idx]); //class_ids[idx]是class_id
		draw_result(result, label, box);
	}
}


int main(int argc, char* argv[])
{
	float* inputs = nullptr;
	float* outputs = nullptr;
	cudaMallocHost(&inputs, sizeof(float) * input_numel);
	cudaMallocHost(&outputs, sizeof(float) * output_numel);
	cv::Mat image = cv::imread("bus.jpg");
	pre_process(image, inputs);

	std::string model = "yolov5n_int8.engine";
	process(model, inputs, outputs);

	cv::Mat result = image.clone();
	post_process(image, result, outputs);

	cv::imwrite("result.jpg", result);
	cudaFreeHost(inputs);
	cudaFreeHost(outputs);
	return 0;
}

给算法爸爸上香

关注

4
点赞
踩
35

收藏

觉得还不错? 一键收藏
打赏
8
评论
yolov5量化部署（基于openvino和tensorrt）

下面的量化代码改编自：https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/111-yolov5-quantization-migration。然后，通过YOLOv5提供的export.py将预训练的Pytorch模型转换为OpenVINO FP32 IR模型。python和c++的推理代码都有。其中在我的机器上，fp32模型的yolov5n-yolov5l版本首次推理时间较长，不清楚是什么原因。
复制链接

扫一扫