[YOLOv5 libtorch] Loading a TorchScript model in C++ for GPU inference on Windows 10

#include <windows.h>
#include <opencv2/opencv.hpp>
#include <torch/script.h>
#include <algorithm>
#include <fstream>
#include <iostream>
#include <time.h>
#include <libloaderapi.h>

std::vector<torch::Tensor> non_max_suppression(torch::Tensor preds, float score_thresh = 0.5, float iou_thresh = 0.5)
{
	std::vector<torch::Tensor> output;
	for (size_t i = 0; i < preds.sizes()[0]; ++i)
	{
		torch::Tensor pred = preds.select(0, i);

		// Filter by scores
		torch::Tensor scores = pred.select(1, 4) * std::get<0>(torch::max(pred.slice(1, 5, pred.sizes()[1]), 1));
		pred = torch::index_select(pred, 0, torch::nonzero(scores > score_thresh).select(1, 0));
		if (pred.sizes()[0] == 0) continue;

		// (center_x, center_y, w, h) to (left, top, right, bottom)
		pred.select(1, 0) = pred.select(1, 0) - pred.select(1, 2) / 2;
		pred.select(1, 1) = pred.select(1, 1) - pred.select(1, 3) / 2;
		pred.select(1, 2) = pred.select(1, 0) + pred.select(1, 2);
		pred.select(1, 3) = pred.select(1, 1) + pred.select(1, 3);

		// Computing scores and classes
		std::tuple<torch::Tensor, torch::Tensor> max_tuple = torch::max(pred.slice(1, 5, pred.sizes()[1]), 1);
		pred.select(1, 4) = pred.select(1, 4) * std::get<0>(max_tuple);
		pred.select(1, 5) = std::get<1>(max_tuple);

		torch::Tensor  dets = pred.slice(1, 0, 6);

		torch::Tensor keep = torch::empty({ dets.sizes()[0] });
		torch::Tensor areas = (dets.select(1, 3) - dets.select(1, 1)) * (dets.select(1, 2) - dets.select(1, 0));
		std::tuple<torch::Tensor, torch::Tensor> indexes_tuple = torch::sort(dets.select(1, 4), 0, 1);
		torch::Tensor v = std::get<0>(indexes_tuple);
		torch::Tensor indexes = std::get<1>(indexes_tuple);
		int count = 0;
		while (indexes.sizes()[0] > 0)
		{
			keep[count] = (indexes[0].item().toInt());
			count += 1;

			// Computing overlaps
			torch::Tensor lefts = torch::empty(indexes.sizes()[0] - 1);
			torch::Tensor tops = torch::empty(indexes.sizes()[0] - 1);
			torch::Tensor rights = torch::empty(indexes.sizes()[0] - 1);
			torch::Tensor bottoms = torch::empty(indexes.sizes()[0] - 1);
			torch::Tensor widths = torch::empty(indexes.sizes()[0] - 1);
			torch::Tensor heights = torch::empty(indexes.sizes()[0] - 1);
			for (size_t i = 0; i < indexes.sizes()[0] - 1; ++i)
			{
				lefts[i] = std::max(dets[indexes[0]][0].item().toFloat(), dets[indexes[i + 1]][0].item().toFloat());
				tops[i] = std::max(dets[indexes[0]][1].item().toFloat(), dets[indexes[i + 1]][1].item().toFloat());
				rights[i] = std::min(dets[indexes[0]][2].item().toFloat(), dets[indexes[i + 1]][2].item().toFloat());
				bottoms[i] = std::min(dets[indexes[0]][3].item().toFloat(), dets[indexes[i + 1]][3].item().toFloat());
				widths[i] = std::max(float(0), rights[i].item().toFloat() - lefts[i].item().toFloat());
				heights[i] = std::max(float(0), bottoms[i].item().toFloat() - tops[i].item().toFloat());
			}
			torch::Tensor overlaps = widths * heights;

			// Filter by IoUs
			torch::Tensor ious = overlaps / (areas.select(0, indexes[0].item().toInt()) + torch::index_select(areas, 0, indexes.slice(0, 1, indexes.sizes()[0])) - overlaps);
			indexes = torch::index_select(indexes, 0, torch::nonzero(ious <= iou_thresh).select(1, 0) + 1);
		}
		keep = keep.toType(torch::kInt64);
		output.push_back(torch::index_select(dets, 0, keep.slice(0, 0, count)));
	}
	return output;
}
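
The per-element loop above is functional, but it extracts every coordinate through .item(), which forces a scalar copy per box. A hedged sketch of a vectorized overlap computation that could replace the inner for loop (assuming the same dets column layout of left, top, right, bottom, score, class):

			// Boxes still in the running, excluding the current highest-scoring one
			torch::Tensor rest = torch::index_select(dets, 0, indexes.slice(0, 1, indexes.sizes()[0]));
			torch::Tensor cur = dets[indexes[0]];
			// Intersection rectangle per remaining box, then clamped width * height
			torch::Tensor lefts = torch::max(rest.select(1, 0), cur[0]);
			torch::Tensor tops = torch::max(rest.select(1, 1), cur[1]);
			torch::Tensor rights = torch::min(rest.select(1, 2), cur[2]);
			torch::Tensor bottoms = torch::min(rest.select(1, 3), cur[3]);
			torch::Tensor overlaps = torch::clamp_min(rights - lefts, 0) * torch::clamp_min(bottoms - tops, 0);

The IoU filtering that follows stays unchanged, since overlaps keeps the same shape and meaning as before.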


int main()
{
    cout << "cuda是否可用:" << torch::cuda::is_available() << "\t显卡数量:" << torch::cuda::device_count() << endl;
	cout << "cudnn是否可用:" << torch::cuda::cudnn_is_available() << endl;
	// Load the TorchScript module
	torch::jit::script::Module module;
	try
	{
		LoadLibraryA("ATen_cuda.dll");
		LoadLibraryA("c10_cuda.dll");
		LoadLibraryA("torch_cuda.dll");
		LoadLibraryA("torchvision.dll");
		module = torch::jit::load("weights/best.torchscript");

		module.to(torch::Device(torch::kCUDA));//
	}
	catch (const std::exception& e)
	{
		std::cout << e.what();
	}
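
	// Optional warm-up (hedged sketch): the first CUDA forward pass is much
	// slower than steady state (context creation, kernel selection), so one
	// dummy inference here keeps the FPS readings in the loop representative.
	{
		torch::NoGradGuard no_grad;
		torch::Tensor warmup = torch::zeros({ 1, 3, 416, 416 }).to(torch::Device(torch::kCUDA));
		module.forward({ warmup });
	}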



	std::vector<std::string> classnames;
	std::ifstream f("weights/block.txt");
	std::string name = "";
	while (std::getline(f, name))
	{
		classnames.push_back(name);
	}

	cv::VideoCapture cap = cv::VideoCapture("1.mp4");
	//cap.set(cv::CAP_PROP_FRAME_WIDTH, 1920);
	//cap.set(cv::CAP_PROP_FRAME_HEIGHT, 1080);
	cv::Mat frame, img;
	while (cap.isOpened())
	{
		clock_t start = clock();
		cap.read(frame);
		if (frame.empty())
		{
			std::cout << "Read frame failed!" << std::endl;
			break;
		}

		// Preparing input tensor
		cv::resize(frame, img, cv::Size(416, 416));
		cv::cvtColor(img, img, cv::COLOR_BGR2RGB);

		torch::Tensor ten_img = torch::from_blob(img.data, { 1, img.rows, img.cols, 3 }, torch::kByte).to(torch::Device(torch::kCUDA));
		ten_img = ten_img.permute({ 0, 3, 1, 2 });
		ten_img = ten_img.toType(torch::kFloat);
		ten_img = ten_img.div(255);
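		// Hedged note: if the model was exported with --half, the input dtype
		// must match, e.g. ten_img = ten_img.toType(torch::kHalf); otherwise
		// forward() fails with a dtype mismatch.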
		// preds: [?, 15120, 9]
		// Note: the libtorch CUDA build (cu113 here) must match the installed CUDA toolkit version.
		torch::Tensor preds = module.forward({ ten_img }).toTuple()->elements()[0].toTensor();

		std::vector<torch::Tensor> dets = non_max_suppression(preds.to(at::kCPU), 0.4, 0.5);
		if (dets.size() > 0)
		{
			// Visualize result
			for (size_t i = 0; i < dets[0].sizes()[0]; ++i)
			{
				float left = dets[0][i][0].item().toFloat() * frame.cols / 416;
				float top = dets[0][i][1].item().toFloat() * frame.rows / 416;
				float right = dets[0][i][2].item().toFloat() * frame.cols / 416;
				float bottom = dets[0][i][3].item().toFloat() * frame.rows / 416;
				float score = dets[0][i][4].item().toFloat();
				int classID = dets[0][i][5].item().toInt();

				cv::rectangle(frame, cv::Rect(left, top, (right - left), (bottom - top)), cv::Scalar(0, 255, 0), 2);

				cv::putText(frame,
					classnames[classID] + ": " + cv::format("%.2f", score),
					cv::Point(left, top),
					cv::FONT_HERSHEY_SIMPLEX, (right - left) / 200, cv::Scalar(0, 255, 0), 2);
			}
		}
		cv::putText(frame, "FPS: " + std::to_string(int(1e7 / (clock() - start))),
			cv::Point(50, 50),
			cv::FONT_HERSHEY_SIMPLEX, 1, cv::Scalar(0, 255, 0), 2);
		cv::imshow("", frame);
		if (cv::waitKey(1) == 27) break;
	}
	return 0;
}
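
A caveat on preprocessing: the plain cv::resize above stretches the frame to 416x416 and distorts the aspect ratio, whereas YOLOv5's Python pipeline uses letterbox resizing (scale to fit, pad with gray). A minimal sketch of such a helper, assuming a square 416 target; box coordinates would then need the inverse offset and scale applied before drawing:

cv::Mat letterbox(const cv::Mat& src, int target = 416)
{
	// Scale so the longer side fits, keeping aspect ratio
	float r = std::min(target / (float)src.cols, target / (float)src.rows);
	int new_w = int(src.cols * r), new_h = int(src.rows * r);
	cv::Mat resized;
	cv::resize(src, resized, cv::Size(new_w, new_h));
	// Pad to target x target with the 114-gray YOLOv5 uses
	cv::Mat out(target, target, src.type(), cv::Scalar(114, 114, 114));
	resized.copyTo(out(cv::Rect((target - new_w) / 2, (target - new_h) / 2, new_w, new_h)));
	return out;
}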
Export command:

python export.py --weights best.pt --img 416 --batch 1

export.py

def parse_opt():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, default=ROOT / 'data/block.yaml', help='dataset.yaml path')
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'best.pt', help='model.pt path(s)')
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[416, 416], help='image (h, w)')
    parser.add_argument('--batch-size', type=int, default=1, help='batch size')
    parser.add_argument('--device', default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--half', action='store_true', help='FP16 half-precision export')
    parser.add_argument('--inplace', action='store_true', help='set YOLOv5 Detect() inplace=True')
    parser.add_argument('--train', action='store_true', help='model.train() mode')
    parser.add_argument('--keras', action='store_true', help='TF: use Keras')
    parser.add_argument('--optimize', action='store_true', help='TorchScript: optimize for mobile')
    parser.add_argument('--int8', action='store_true', help='CoreML/TF INT8 quantization')
    parser.add_argument('--dynamic', action='store_true', help='ONNX/TF: dynamic axes')
    parser.add_argument('--simplify', action='store_true', help='ONNX: simplify model')
    parser.add_argument('--opset', type=int, default=12, help='ONNX: opset version')
    parser.add_argument('--verbose', action='store_true', help='TensorRT: verbose log')
    parser.add_argument('--workspace', type=int, default=4, help='TensorRT: workspace size (GB)')
    parser.add_argument('--nms', action='store_true', help='TF: add NMS to model')
    parser.add_argument('--agnostic-nms', action='store_true', help='TF: add agnostic NMS to model')
    parser.add_argument('--topk-per-class', type=int, default=100, help='TF.js NMS: topk per class to keep')
    parser.add_argument('--topk-all', type=int, default=100, help='TF.js NMS: topk for all classes to keep')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='TF.js NMS: IoU threshold')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='TF.js NMS: confidence threshold')
    parser.add_argument('--include',
                        nargs='+',
                        default=['torchscript', 'onnx'],
                        help='torchscript, onnx, openvino, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs')
    opt = parser.parse_args()
    print_args(vars(opt))
    return opt

Exporting the .pt model to ONNX

D:\test\Train\yolov5 [master ↓30 +10 ~4 -0 !]> python  export.py --weights best.pt --img 416 --batch 16
export: data=D:\test\Train\yolov5\data\coco128.yaml, weights=['best.pt'], imgsz=[416], batch_size=16, device=cpu, half=False, inplace=False, train=False, keras=False, optimize=False, int8=False, dynamic=False, simplify=False, opset=12, verbose=False, workspace=4, nms=False, agnostic_nms=False, topk_per_class=100, topk_all=100, iou_thres=0.45, conf_thres=0.25, include=['torchscript', 'onnx']
YOLOv5  v6.1-223-g1dcb774 Python-3.9.13 torch-1.9.0+cu111 CPU

Fusing layers...
YOLOv5s summary: 213 layers, 7012822 parameters, 0 gradients, 15.8 GFLOPs

PyTorch: starting from best.pt with output shape (16, 10647, 6) (13.7 MB)

TorchScript: starting export with torch 1.9.0+cu111...
TorchScript: export success, saved as best.torchscript (27.1 MB)

ONNX: starting export with onnx 1.10.2...
ONNX: export success, saved as best.onnx (26.9 MB)

Export complete (15.79s)
Results saved to D:\test\Train\yolov5
Detect:          python detect.py --weights best.onnx
PyTorch Hub:     model = torch.hub.load('ultralytics/yolov5', 'custom', 'best.onnx')
Validate:        python val.py --weights best.onnx
Visualize:       https://netron.app

Appendix: libtorch downloads


libtorch 1.10.0
release
# cpu
https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.10.0%2Bcpu.zip
# cuda
https://download.pytorch.org/libtorch/cu102/libtorch-win-shared-with-deps-1.10.0%2Bcu102.zip
https://download.pytorch.org/libtorch/cu113/libtorch-win-shared-with-deps-1.10.0%2Bcu113.zip
 
debug
# cpu
https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.10.0%2Bcpu.zip
# cuda
https://download.pytorch.org/libtorch/cu102/libtorch-win-shared-with-deps-debug-1.10.0%2Bcu102.zip
https://download.pytorch.org/libtorch/cu113/libtorch-win-shared-with-deps-debug-1.10.0%2Bcu113.zip




libtorch 1.9.1
release
# cpu
https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-1.9.1%2Bcpu.zip
# cuda
https://download.pytorch.org/libtorch/cu102/libtorch-win-shared-with-deps-1.9.1%2Bcu102.zip
https://download.pytorch.org/libtorch/cu111/libtorch-win-shared-with-deps-1.9.1%2Bcu111.zip

debug
# cpu
https://download.pytorch.org/libtorch/cpu/libtorch-win-shared-with-deps-debug-1.9.1%2Bcpu.zip
# cuda
https://download.pytorch.org/libtorch/cu102/libtorch-win-shared-with-deps-debug-1.9.1%2Bcu102.zip
https://download.pytorch.org/libtorch/cu111/libtorch-win-shared-with-deps-debug-1.9.1%2Bcu111.zip





libtorch 1.8.2 (LTS)
release
# cpu
https://download.pytorch.org/libtorch/lts/1.8/cpu/libtorch-win-shared-with-deps-1.8.2%2Bcpu.zip
# cuda
https://download.pytorch.org/libtorch/lts/1.8/cu102/libtorch-win-shared-with-deps-1.8.2%2Bcu102.zip
https://download.pytorch.org/libtorch/lts/1.8/cu111/libtorch-win-shared-with-deps-1.8.2%2Bcu111.zip

debug
# cpu
https://download.pytorch.org/libtorch/lts/1.8/cpu/libtorch-win-shared-with-deps-debug-1.8.2%2Bcpu.zip
# cuda
https://download.pytorch.org/libtorch/lts/1.8/cu102/libtorch-win-shared-with-deps-debug-1.8.2%2Bcu102.zip
https://download.pytorch.org/libtorch/lts/1.8/cu111/libtorch-win-shared-with-deps-debug-1.8.2%2Bcu111.zip

train.py opt

def parse_opt(known=False):
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default=ROOT / 'yolov5s.pt', help='initial weights path')
    parser.add_argument('--cfg', type=str, default='models/yolov5s.yaml', help='model.yaml path')
    parser.add_argument('--data', type=str, default=ROOT / './data/block.yaml', help='dataset.yaml path')
    parser.add_argument('--hyp', type=str, default=ROOT / 'data/hyps/hyp.scratch-low.yaml', help='hyperparameters path')
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs, -1 for autobatch')#16
    parser.add_argument('--imgsz', '--img', '--img-size', type=int, default=416, help='train, val image size (pixels)')#640
    parser.add_argument('--rect', action='store_true', help='rectangular training')
    parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training')
    parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
    parser.add_argument('--noval', action='store_true', help='only validate final epoch')
    parser.add_argument('--noautoanchor', action='store_true', help='disable AutoAnchor')
    parser.add_argument('--noplots', action='store_true', help='save no plot files')
    parser.add_argument('--evolve', type=int, nargs='?', const=300, help='evolve hyperparameters for x generations')
    parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
    parser.add_argument('--cache', type=str, nargs='?', const='ram', help='--cache images in "ram" (default) or "disk"')
    parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training')
    parser.add_argument('--device', default='0', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%')
    parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class')
    parser.add_argument('--optimizer', type=str, choices=['SGD', 'Adam', 'AdamW'], default='SGD', help='optimizer')
    parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode')
    parser.add_argument('--workers', type=int, default=5, help='max dataloader workers (per RANK in DDP mode)')  # default was 8; with 8 this threw "__init__() missing 1 required positional argument: 'dtype'"
    parser.add_argument('--project', default=ROOT / 'runs/train', help='save to project/name')
    parser.add_argument('--name', default='exp', help='save to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--quad', action='store_true', help='quad dataloader')
    parser.add_argument('--cos-lr', action='store_true', help='cosine LR scheduler')
    parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon')
    parser.add_argument('--patience', type=int, default=100, help='EarlyStopping patience (epochs without improvement)')
    parser.add_argument('--freeze', nargs='+', type=int, default=[0], help='Freeze layers: backbone=10, first3=0 1 2')
    parser.add_argument('--save-period', type=int, default=-1, help='Save checkpoint every x epochs (disabled if < 1)')
    parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify')

    # Weights & Biases arguments
    parser.add_argument('--entity', default=None, help='W&B: Entity')
    parser.add_argument('--upload_dataset', nargs='?', const=True, default=False, help='W&B: Upload data, "val" option')
    parser.add_argument('--bbox_interval', type=int, default=-1, help='W&B: Set bounding-box image logging interval')
    parser.add_argument('--artifact_alias', type=str, default='latest', help='W&B: Version of dataset artifact to use')

    opt = parser.parse_known_args()[0] if known else parser.parse_args()
    return opt
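
With these defaults, a typical training invocation for this project (hedged example; all flags taken from the parser above) would be:

python train.py --weights yolov5s.pt --data data/block.yaml --img 416 --batch-size 16 --device 0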

