3. PicoDet C++ ONNX Runtime inference, and implementing reshape and transpose in C++

1. Full ONNX C++ inference

"Full" here refers to the model exported with Paddle's export.py and benchmark=True: that model has no post-processing and no NMS. The inference approach can follow https://github.com/hpc203/picodet-onnxruntime directly.

I made a few small modifications; the code is as follows:

#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <fstream>
#include <string>
#include <math.h>
#include <vector>
#include <array>
#include <algorithm>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>

//using namespace cv;
//using namespace std;
//using namespace Ort;

typedef struct BoxInfo
{
	float x1;
	float y1;
	float x2;
	float y2;
	float score;
	int label;
} BoxInfo;

class PicoDet
{
public:
	PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold);
	void detect(cv::Mat& cv_image);
private:
	float score_threshold = 0.5;
	float nms_threshold = 0.5;
	std::vector<std::string> class_names;
	int num_class;

	cv::Mat resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left);
	std::vector<float> input_image_;
	void normalize_(cv::Mat img);
	void softmax_(const float* x, float* y, int length);
	void generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box);
	void nms(std::vector<BoxInfo>& input_boxes);
	const bool keep_ratio = false;
	int inpWidth;
	int inpHeight;
	int num_outs;
	int reg_max;
	std::vector<int> stride;
	//const float mean[3] = { 103.53, 116.28, 123.675 };
	//const float stds[3] = { 57.375, 57.12, 58.395 };
	const float mean[3] = { 0.0, 0.0, 0.0 };
	const float stds[3] = { 255.0, 255.0, 255.0 };

	Ort::Env env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "picodet");
	Ort::Session* ort_session = nullptr;
	Ort::SessionOptions sessionOptions = Ort::SessionOptions();
	std::vector<char*> input_names;
	std::vector<char*> output_names;
	std::vector<std::vector<int64_t>> input_node_dims;  // >=1 inputs
	std::vector<std::vector<int64_t>> output_node_dims; // >=1 outputs
};

PicoDet::PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold)
{
	std::ifstream ifs(classesFile.c_str());
	std::string line;
	while (std::getline(ifs, line)) this->class_names.push_back(line);
	this->num_class = class_names.size();
	this->nms_threshold = nms_threshold;
	this->score_threshold = objThreshold;

	std::wstring widestr = std::wstring(model_path.begin(), model_path.end());
	//OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(sessionOptions, 0);
	sessionOptions.SetGraphOptimizationLevel(ORT_ENABLE_BASIC);
	ort_session = new Ort::Session(env, widestr.c_str(), sessionOptions);
	size_t numInputNodes = ort_session->GetInputCount();
	size_t numOutputNodes = ort_session->GetOutputCount();
	Ort::AllocatorWithDefaultOptions allocator;
	for (int i = 0; i < numInputNodes; i++)
	{
		input_names.push_back(ort_session->GetInputName(i, allocator));
		Ort::TypeInfo input_type_info = ort_session->GetInputTypeInfo(i);
		auto input_tensor_info = input_type_info.GetTensorTypeAndShapeInfo();
		auto input_dims = input_tensor_info.GetShape();
		input_node_dims.push_back(input_dims);
	}
	for (int i = 0; i < numOutputNodes; i++)
	{
		output_names.push_back(ort_session->GetOutputName(i, allocator));
		Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
		auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
		auto output_dims = output_tensor_info.GetShape();
		output_node_dims.push_back(output_dims);
		/*for (int j = 0; j < output_dims.size(); j++)
		{
			cout << output_dims[j] << ",";
		}
		cout << endl;*/
	}
	this->inpHeight = input_node_dims[0][2];
	this->inpWidth = input_node_dims[0][3];
	this->num_outs = int(numOutputNodes * 0.5);
	this->reg_max = output_node_dims[this->num_outs][output_node_dims[this->num_outs].size() - 1] / 4 - 1;
	for (int i = 0; i < this->num_outs; i++)
	{
		stride.push_back(int(8 * pow(2, i)));
	}
}

cv::Mat PicoDet::resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left)
{
	int srch = srcimg.rows, srcw = srcimg.cols;
	*newh = this->inpHeight;
	*neww = this->inpWidth;
	cv::Mat dstimg;
	if (this->keep_ratio && srch != srcw) {
		float hw_scale = (float)srch / srcw;
		if (hw_scale > 1) {
			*newh = this->inpHeight;
			*neww = int(this->inpWidth / hw_scale);
			cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_LINEAR);
			*left = int((this->inpWidth - *neww) * 0.5);
			copyMakeBorder(dstimg, dstimg, 0, 0, *left, this->inpWidth - *neww - *left, cv::BORDER_CONSTANT, 0);
		}
		else {
			*newh = (int)(this->inpHeight * hw_scale);
			*neww = this->inpWidth;
			cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_LINEAR); // the interpolation method must match the one used in training
			*top = (int)((this->inpHeight - *newh) * 0.5);
			copyMakeBorder(dstimg, dstimg, *top, this->inpHeight - *newh - *top, 0, 0, cv::BORDER_CONSTANT, 0);
		}
	}
	else {
		cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_LINEAR);
	}
	return dstimg;
}

void PicoDet::normalize_(cv::Mat img)
{
	//    img.convertTo(img, CV_32F);
	int row = img.rows;
	int col = img.cols;
	this->input_image_.resize(row * col * img.channels());
	for (int c = 0; c < 3; c++)
	{
		for (int i = 0; i < row; i++)
		{
			for (int j = 0; j < col; j++)
			{
				float pix = img.ptr<uchar>(i)[j * 3 + c];
				this->input_image_[c * row * col + i * col + j] = (pix / 255.0 - mean[c] / 255.0) / (stds[c] / 255.0);
				//this->input_image_[c * row * col + i * col + j] = (pix - mean[c]) / stds[c];
			}
		}
	}
}

void PicoDet::softmax_(const float* x, float* y, int length)
{
	float sum = 0;
	int i = 0;
	for (i = 0; i < length; i++)
	{
		y[i] = exp(x[i]);
		sum += y[i];
	}
	for (i = 0; i < length; i++)
	{
		y[i] /= sum;
	}
}

void PicoDet::generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box)
{
	const int num_grid_y = (int)ceil((float)this->inpHeight / stride_);
	const int num_grid_x = (int)ceil((float)this->inpWidth / stride_);
	cout << "num_grid_x=" << num_grid_x << ",num_grid_y=" << num_grid_y << endl;
	const int reg_1max = reg_max + 1;
	//std::cout << "score:" << std::endl;
	for (int i = 0; i < num_grid_y; i++)
	{
		for (int j = 0; j < num_grid_x; j++)
		{
			int max_ind = 0;
			float max_score = 0;
			
			for (int k = 0; k < num_class; k++)
			{   
				/* Indexing for the original (full) model output: */
				float score = out_score[i * num_grid_x * num_class + j * num_class + k];
				/* The commented line below replaces the removed Reshape/Transpose with C indexing; use one of the two variants only. The mapping: (i,j,k) in the full output corresponds to (k,i,j) in the pruned output. */
				//float score = std::sqrt(out_score[k*num_grid_y*num_grid_x+i*num_grid_x+j]);
				//std::cout <<score << " ";
				if (score > max_score)
				{
					max_score = score;
					max_ind = k;
				}
			}
			if (max_score >= score_threshold)
			{
				std::cout << "box:" << std::endl;
				//const float* pbox = out_box + idx * reg_1max * 4;
				float dis_pred[4];
				float* y = new float[reg_1max];
				for (int k = 0; k < 4; k++)
				{
					/* original (full) model */
					const float* tmp = out_box + i * num_grid_x * reg_1max * 4 + j * reg_1max * 4 + k * reg_1max;
					//std::cout << "r:" << *tmp << std::endl;
					/* version for the pruned model without Reshape/Transpose */
					//float* tmp = new float[reg_1max];
					//for (int m = 0; m < reg_1max; m++)
					//{
					//tmp[m] = out_box[k * num_grid_y * num_grid_x * reg_1max + i * num_grid_x + j + m * num_grid_y * num_grid_x];
					//}
					//std::cout << "r:" << *tmp << std::endl;
					softmax_(tmp, y, reg_1max);
					float dis = 0.f;
					for (int l = 0; l < reg_1max; l++)
					{
						dis += l * y[l];
					}
					dis_pred[k] = dis * stride_;
				}
				delete[] y;
				float pb_cx = (j + 0.5f) * stride_ - 0.5;
				float pb_cy = (i + 0.5f) * stride_ - 0.5;
				float x0 = pb_cx - dis_pred[0];
				float y0 = pb_cy - dis_pred[1];
				float x1 = pb_cx + dis_pred[2];
				float y1 = pb_cy + dis_pred[3];
				generate_boxes.push_back(BoxInfo{ x0, y0, x1, y1, max_score, max_ind });
			}
		}
	}
}

void PicoDet::nms(std::vector<BoxInfo>& input_boxes)
{
	sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
	std::vector<float> vArea(input_boxes.size());
	for (int i = 0; i < int(input_boxes.size()); ++i)
	{
		vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
			* (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
	}

	std::vector<bool> isSuppressed(input_boxes.size(), false);
	for (int i = 0; i < int(input_boxes.size()); ++i)
	{
		if (isSuppressed[i]) { continue; }
		for (int j = i + 1; j < int(input_boxes.size()); ++j)
		{
			if (isSuppressed[j]) { continue; }
			float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
			float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
			float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
			float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);

			float w = (std::max)(float(0), xx2 - xx1 + 1);
			float h = (std::max)(float(0), yy2 - yy1 + 1);
			float inter = w * h;
			float ovr = inter / (vArea[i] + vArea[j] - inter);

			if (ovr >= this->nms_threshold)
			{
				isSuppressed[j] = true;
			}
		}
	}
	// return post_nms;
	int idx_t = 0;
	input_boxes.erase(remove_if(input_boxes.begin(), input_boxes.end(), [&idx_t, &isSuppressed](const BoxInfo& f) { return isSuppressed[idx_t++]; }), input_boxes.end());
}

void PicoDet::detect(cv::Mat& srcimg)
{
	int newh = 0, neww = 0, top = 0, left = 0;
	cv::Mat cv_image = srcimg.clone();
	cv::Mat dst = this->resize_image(cv_image, &newh, &neww, &top, &left);
	this->normalize_(dst);
	std::array<int64_t, 4> input_shape_{ 1, 3, this->inpHeight, this->inpWidth };

	auto allocator_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
	Ort::Value input_tensor_ = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape_.data(), input_shape_.size());


	std::vector<Ort::Value> ort_outputs = ort_session->Run(Ort::RunOptions{ nullptr }, &input_names[0], &input_tensor_, 1, output_names.data(), output_names.size());   // run inference
	// generate proposals
	std::vector<BoxInfo> generate_boxes;
	for (int i = 0; i < this->num_outs; i++)
	{
		//auto cls_shape = this->output_node_dims[i];
		const float* cls_score = ort_outputs[i].GetTensorMutableData<float>();
		//std::vector<int64_t> new_cls_shape = { cls_shape[0],cls_shape[1],cls_shape[2] * cls_shape[3] };
		


		const float* bbox_pred = ort_outputs[i + this->num_outs].GetTensorMutableData<float>();
		//auto reg_shape = this->output_node_dims[i+this->num_outs];
		generate_proposal(generate_boxes, stride[i], cls_score, bbox_pred);
	}

	// Perform non-maximum suppression to eliminate redundant overlapping boxes with
	// lower confidences
	nms(generate_boxes);
	float ratioh = (float)cv_image.rows / newh;
	float ratiow = (float)cv_image.cols / neww;
	for (size_t i = 0; i < generate_boxes.size(); ++i)
	{
		int xmin = (int)std::max((generate_boxes[i].x1 - left) * ratiow, 0.f);
		int ymin = (int)std::max((generate_boxes[i].y1 - top) * ratioh, 0.f);
		int xmax = (int)std::min((generate_boxes[i].x2 - left) * ratiow, (float)cv_image.cols);
		int ymax = (int)std::min((generate_boxes[i].y2 - top) * ratioh, (float)cv_image.rows);
		rectangle(srcimg, cv::Point(xmin, ymin), cv::Point(xmax, ymax), cv::Scalar(0, 0, 255), 2);
		std::string label = cv::format("%.2f", generate_boxes[i].score);
		label = this->class_names[generate_boxes[i].label] + ":" + label;
		putText(srcimg, label, cv::Point(xmin, ymin - 5), cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 1);
	}
}

int main()
{
	PicoDet mynet("picodet_xs_320_voc_256_20230405_shape.onnx", "ball.names", 0.5, 0.5);  /// choice = ["picodet_m_320_coco.onnx", "picodet_m_416_coco.onnx", "picodet_s_320_coco.onnx", "picodet_s_416_coco.onnx"]
	//PicoDet mynet("Cpicodet_xs_320_voc_256_20230405_shape_sim_prune.onnx", "ball.names", 0.5, 0.5);
	std::string imgpath = "test.jpg";
    cv::Mat bgrimg = cv::imread(imgpath,cv::IMREAD_COLOR);
	cv::Mat rgbimg;
	cv::cvtColor(bgrimg,rgbimg,cv::COLOR_BGR2RGB);
	mynet.detect(rgbimg);
	cv::Mat resultimg;
	cv::cvtColor(rgbimg, resultimg, cv::COLOR_RGB2BGR);
	cv::imwrite("test_result.jpg", resultimg);
	static const std::string kWinName = "Deep learning object detection in ONNXRuntime";
	cv::namedWindow(kWinName, cv::WINDOW_NORMAL);
	cv::imshow(kWinName, resultimg);
	cv::waitKey(0);
	cv::destroyAllWindows();
}

2. Inference with the pruned model

Here the Reshape and Transpose operators have been removed from the model, and their effect has to be reproduced in code.

All that is needed is to add back the part that was cut. We pruned directly from the original model, so compared with the original the only difference is:
[Figure: the Reshape/Transpose subgraph removed from the original model]

So we only need to add this part back.
Many acceleration chips cannot run graphs with this many Reshape and Transpose ops; they have to be done on the CPU, so here these operators are taken out on their own to check the effect. (If you would rather materialize them explicitly than fold them into the indexing, see the helper sketch right below.)
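
As a side note, instead of remapping indices inside generate_proposal you can also materialize the missing Reshape + Transpose once per output with a plain CPU copy. The helper below is only my own sketch (the function name chw_to_hwc and its signature are not from the original code); it rewrites a 1 x C x H x W buffer into (H*W) x C layout so that the full-model indexing keeps working unchanged.

#include <cstddef>
#include <vector>

// Sketch of the removed Reshape+Transpose: copy a 1xCxHxW tensor into (H*W)xC layout.
static std::vector<float> chw_to_hwc(const float* src, int channels, int height, int width)
{
	std::vector<float> dst(static_cast<size_t>(channels) * height * width);
	for (int c = 0; c < channels; c++)
		for (int i = 0; i < height; i++)
			for (int j = 0; j < width; j++)
				// destination element (i, j, c) <- source element (c, i, j)
				dst[(static_cast<size_t>(i) * width + j) * channels + c] =
					src[(static_cast<size_t>(c) * height + i) * width + j];
	return dst;
}

The on-the-fly indexing described in the rest of this section avoids this extra copy, which matters when the head outputs are large.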

2.1 Simulating the classification head's reshape and transpose for C++ in Python

As the figure shows, there are two heads: classification and box regression. In ONNX Runtime, the classification output of shape 1 x c x k x k goes -> Reshape -> 1 x c x (k*k) -> Transpose -> 1 x (k*k) x c; the regression head goes 1 x 32 x k x k -> Reshape -> 1 x 32 x (k*k) -> Transpose -> 1 x (k*k) x 32.

k is the final feature-map size of each detection head: with a 256 input and head strides of [8, 16, 32, 64], k is [32, 16, 8, 4]. Writing this directly in C++ was confusing at first, so I prototype it in Python.
For the experiment we use k = 4, 2 classes, and batch size 1. For Python to mimic C++, the arrays are flattened so that, in terms of memory layout, the elements are contiguous.

import numpy as np
num_grid_x=4 # width
num_grid_y=4 # height
num_cls=2 # number of classes
a = np.arange(num_cls*num_grid_x*num_grid_y).reshape(num_cls,num_grid_y,num_grid_x) # pruned-model output, laid out as (c, k, k)
aa=a.flatten()
b = a.transpose(1,2,0) # equivalent of the full model's reshape + transpose
bb = b.flatten()
# a is the pruned-model output and b the full-model output; we want to reproduce the usual access to b by indexing into a, i.e. implement reshape and transpose through indexing
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(num_cls): 
            print(b[i,j,k],"  ",a[k,i,j])

0    0
16    16
1    1
17    17
2    2
18    18
3    3
19    19
4    4
20    20
5    5
21    21
6    6
22    22
7    7
23    23
8    8
24    24
9    9
25    25
10    10
26    26
11    11
27    27
12    12
28    28
13    13
29    29
14    14
30    30
15    15
31    31

You can see the results are identical. What happens if the output is not square? See the code below:

num_grid_x=4 # width
num_grid_y=3 # height
num_cls=2 # number of classes
a = np.arange(num_cls*num_grid_x*num_grid_y).reshape(num_cls,num_grid_y,num_grid_x) # pruned-model output, laid out as (c, h, w)
aa=a.flatten()
b = a.transpose(1,2,0) # equivalent of the full model's reshape + transpose
bb = b.flatten()
# a is the pruned-model output and b the full-model output; reproduce the usual access to b by indexing into a
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(num_cls): 
            print(b[i,j,k],"  ",a[k,i,j])
0    0
12    12
1    1
13    13
2    2
14    14
3    3
15    15
4    4
16    16
5    5
17    17
6    6
18    18
7    7
19    19
8    8
20    20
9    9
21    21
10    10
22    22
11    11
23    23

This works too, so we can conclude: the index correspondence between the k x k x c layout and the c x k x k layout is (i, j, k) <-> (k, i, j). Equivalently, the full model's k x k x c output is just the pruned c x k x k output with its axes permuted (transpose(1, 2, 0) in NumPy), so accessing (i, j, k) there means accessing (k, i, j) here.
Next, the same thing with C-style flat indexing:

num_grid_x=4 # width
num_grid_y=4 # height
num_cls=2 # number of classes
a = np.arange(num_cls*num_grid_x*num_grid_y).reshape(num_cls,num_grid_y,num_grid_x) # pruned-model output, laid out as (c, k, k)
aa=a.flatten()
b = a.transpose(1,2,0) # equivalent of the full model's reshape + transpose
bb = b.flatten()
# a is the pruned-model output and b the full-model output; access the flat buffers with C-style indices
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(num_cls): 
            tb=i*num_grid_x*num_cls+j*num_cls+k
            ta=k*num_grid_y*num_grid_x+i*num_grid_x+j
            print(bb[tb]," ",aa[ta])
0   0
16   16
1   1
17   17
2   2
18   18
3   3
19   19
4   4
20   20
5   5
21   21
6   6
22   22
7   7
23   23
8   8
24   24
9   9
25   25
10   10
26   26
11   11
27   27
12   12
28   28
13   13
29   29
14   14
30   30
15   15
31   31

For the C code, see PicoDet::generate_proposal above; a standalone C++ check of the same mapping is sketched below.
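
The check below is my own addition (not in the original post): it repeats the non-square Python experiment in plain C++, filling the pruned (num_cls, num_grid_y, num_grid_x) layout with 0..N-1, building the full model's (num_grid_y, num_grid_x, num_cls) layout from it, and asserting that the two flat-index formulas used in generate_proposal pick out the same element.

#include <cassert>
#include <iostream>
#include <vector>

int main()
{
	const int num_grid_x = 4, num_grid_y = 3, num_cls = 2;

	// a: pruned-model layout (num_cls, num_grid_y, num_grid_x), filled with 0..N-1
	std::vector<int> a(num_cls * num_grid_y * num_grid_x);
	for (int n = 0; n < (int)a.size(); n++) a[n] = n;

	// b: full-model layout after Reshape + Transpose, i.e. (num_grid_y, num_grid_x, num_cls)
	std::vector<int> b(a.size());
	for (int k = 0; k < num_cls; k++)
		for (int i = 0; i < num_grid_y; i++)
			for (int j = 0; j < num_grid_x; j++)
				b[(i * num_grid_x + j) * num_cls + k] = a[(k * num_grid_y + i) * num_grid_x + j];

	// Reading b at (i, j, k) must equal reading a at (k, i, j).
	for (int i = 0; i < num_grid_y; i++)
		for (int j = 0; j < num_grid_x; j++)
			for (int k = 0; k < num_cls; k++)
				assert(b[i * num_grid_x * num_cls + j * num_cls + k] ==
				       a[k * num_grid_y * num_grid_x + i * num_grid_x + j]);

	std::cout << "index mapping (i,j,k) <-> (k,i,j) verified" << std::endl;
	return 0;
}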

2.2 Simulating the regression head's reshape and transpose in Python

The leading dimension of 32 is fixed (32 = 4 box sides x (reg_max + 1) = 4 x 8).

num_grid_x=4
num_grid_y=4

a = np.arange(4*8*num_grid_x*num_grid_y).reshape(32,num_grid_y,num_grid_x).reshape(4,8,num_grid_y,num_grid_x) # a is the pruned-model output; 32xkxk and 4x8xkxk are laid out the same way in memory
b = a.transpose(2,3,0,1) 
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(4):
            print(b[i,j,k],"  ",a[k,:,i,j])

[  0  16  32  48  64  80  96 112]    [  0  16  32  48  64  80  96 112]
[128 144 160 176 192 208 224 240]    [128 144 160 176 192 208 224 240]
[256 272 288 304 320 336 352 368]    [256 272 288 304 320 336 352 368]
[384 400 416 432 448 464 480 496]    [384 400 416 432 448 464 480 496]
[  1  17  33  49  65  81  97 113]    [  1  17  33  49  65  81  97 113]
[129 145 161 177 193 209 225 241]    [129 145 161 177 193 209 225 241]
[257 273 289 305 321 337 353 369]    [257 273 289 305 321 337 353 369]
[385 401 417 433 449 465 481 497]    [385 401 417 433 449 465 481 497]
[  2  18  34  50  66  82  98 114]    [  2  18  34  50  66  82  98 114]
[130 146 162 178 194 210 226 242]    [130 146 162 178 194 210 226 242]
[258 274 290 306 322 338 354 370]    [258 274 290 306 322 338 354 370]
[386 402 418 434 450 466 482 498]    [386 402 418 434 450 466 482 498]
[  3  19  35  51  67  83  99 115]    [  3  19  35  51  67  83  99 115]
[131 147 163 179 195 211 227 243]    [131 147 163 179 195 211 227 243]
[259 275 291 307 323 339 355 371]    [259 275 291 307 323 339 355 371]
[387 403 419 435 451 467 483 499]    [387 403 419 435 451 467 483 499]
[  4  20  36  52  68  84 100 116]    [  4  20  36  52  68  84 100 116]
[132 148 164 180 196 212 228 244]    [132 148 164 180 196 212 228 244]
[260 276 292 308 324 340 356 372]    [260 276 292 308 324 340 356 372]
[388 404 420 436 452 468 484 500]    [388 404 420 436 452 468 484 500]
[  5  21  37  53  69  85 101 117]    [  5  21  37  53  69  85 101 117]
[133 149 165 181 197 213 229 245]    [133 149 165 181 197 213 229 245]
[261 277 293 309 325 341 357 373]    [261 277 293 309 325 341 357 373]
[389 405 421 437 453 469 485 501]    [389 405 421 437 453 469 485 501]
[  6  22  38  54  70  86 102 118]    [  6  22  38  54  70  86 102 118]
[134 150 166 182 198 214 230 246]    [134 150 166 182 198 214 230 246]
[262 278 294 310 326 342 358 374]    [262 278 294 310 326 342 358 374]
[390 406 422 438 454 470 486 502]    [390 406 422 438 454 470 486 502]
[  7  23  39  55  71  87 103 119]    [  7  23  39  55  71  87 103 119]
[135 151 167 183 199 215 231 247]    [135 151 167 183 199 215 231 247]
[263 279 295 311 327 343 359 375]    [263 279 295 311 327 343 359 375]
[391 407 423 439 455 471 487 503]    [391 407 423 439 455 471 487 503]
[  8  24  40  56  72  88 104 120]    [  8  24  40  56  72  88 104 120]
[136 152 168 184 200 216 232 248]    [136 152 168 184 200 216 232 248]
[264 280 296 312 328 344 360 376]    [264 280 296 312 328 344 360 376]
[392 408 424 440 456 472 488 504]    [392 408 424 440 456 472 488 504]
[  9  25  41  57  73  89 105 121]    [  9  25  41  57  73  89 105 121]
[137 153 169 185 201 217 233 249]    [137 153 169 185 201 217 233 249]
[265 281 297 313 329 345 361 377]    [265 281 297 313 329 345 361 377]
[393 409 425 441 457 473 489 505]    [393 409 425 441 457 473 489 505]
[ 10  26  42  58  74  90 106 122]    [ 10  26  42  58  74  90 106 122]
[138 154 170 186 202 218 234 250]    [138 154 170 186 202 218 234 250]
[266 282 298 314 330 346 362 378]    [266 282 298 314 330 346 362 378]
[394 410 426 442 458 474 490 506]    [394 410 426 442 458 474 490 506]
[ 11  27  43  59  75  91 107 123]    [ 11  27  43  59  75  91 107 123]
[139 155 171 187 203 219 235 251]    [139 155 171 187 203 219 235 251]
[267 283 299 315 331 347 363 379]    [267 283 299 315 331 347 363 379]
[395 411 427 443 459 475 491 507]    [395 411 427 443 459 475 491 507]
[ 12  28  44  60  76  92 108 124]    [ 12  28  44  60  76  92 108 124]
[140 156 172 188 204 220 236 252]    [140 156 172 188 204 220 236 252]
[268 284 300 316 332 348 364 380]    [268 284 300 316 332 348 364 380]
[396 412 428 444 460 476 492 508]    [396 412 428 444 460 476 492 508]
[ 13  29  45  61  77  93 109 125]    [ 13  29  45  61  77  93 109 125]
[141 157 173 189 205 221 237 253]    [141 157 173 189 205 221 237 253]
[269 285 301 317 333 349 365 381]    [269 285 301 317 333 349 365 381]
[397 413 429 445 461 477 493 509]    [397 413 429 445 461 477 493 509]
[ 14  30  46  62  78  94 110 126]    [ 14  30  46  62  78  94 110 126]
[142 158 174 190 206 222 238 254]    [142 158 174 190 206 222 238 254]
[270 286 302 318 334 350 366 382]    [270 286 302 318 334 350 366 382]
[398 414 430 446 462 478 494 510]    [398 414 430 446 462 478 494 510]
[ 15  31  47  63  79  95 111 127]    [ 15  31  47  63  79  95 111 127]
[143 159 175 191 207 223 239 255]    [143 159 175 191 207 223 239 255]
[271 287 303 319 335 351 367 383]    [271 287 303 319 335 351 367 383]
[399 415 431 447 463 479 495 511]    [399 415 431 447 463 479 495 511]

The array above is four-dimensional, but effectively only three dimensions are indexed; splitting 32 into 4 x 8 just gives 4 groups of 8 consecutive values (the 8 possible bin positions). To turn this into C code, compare the two snippets below: the first reads 8 contiguous elements and does not match, the second reads with a stride of num_grid_y*num_grid_x and does.

num_grid_x=4
num_grid_y=4
a = np.arange(4*8*num_grid_x*num_grid_y).reshape(32,num_grid_y,num_grid_x).reshape(4,8,num_grid_y,num_grid_x)
aa = a.flatten()
b = a.transpose(2,3,0,1)
bb = b.flatten()
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(4):
            #print(b[i,j,k],"  ",a[k,:,i,j])
            t1 =i*num_grid_x*32+j*32+k*8
            t2 =k*8*num_grid_x*num_grid_y+i*num_grid_x+j
            print(f"{bb[t1:t1+8]} {aa[t2:t2+8]}")
[  0  16  32  48  64  80  96 112] [0 1 2 3 4 5 6 7]
[128 144 160 176 192 208 224 240] [128 129 130 131 132 133 134 135]
[256 272 288 304 320 336 352 368] [256 257 258 259 260 261 262 263]
[384 400 416 432 448 464 480 496] [384 385 386 387 388 389 390 391]
[  1  17  33  49  65  81  97 113] [1 2 3 4 5 6 7 8]
[129 145 161 177 193 209 225 241] [129 130 131 132 133 134 135 136]
[257 273 289 305 321 337 353 369] [257 258 259 260 261 262 263 264]
[385 401 417 433 449 465 481 497] [385 386 387 388 389 390 391 392]
[  2  18  34  50  66  82  98 114] [2 3 4 5 6 7 8 9]
[130 146 162 178 194 210 226 242] [130 131 132 133 134 135 136 137]
[258 274 290 306 322 338 354 370] [258 259 260 261 262 263 264 265]
[386 402 418 434 450 466 482 498] [386 387 388 389 390 391 392 393]
[  3  19  35  51  67  83  99 115] [ 3  4  5  6  7  8  9 10]
[131 147 163 179 195 211 227 243] [131 132 133 134 135 136 137 138]
[259 275 291 307 323 339 355 371] [259 260 261 262 263 264 265 266]
[387 403 419 435 451 467 483 499] [387 388 389 390 391 392 393 394]
[  4  20  36  52  68  84 100 116] [ 4  5  6  7  8  9 10 11]
[132 148 164 180 196 212 228 244] [132 133 134 135 136 137 138 139]
[260 276 292 308 324 340 356 372] [260 261 262 263 264 265 266 267]
[388 404 420 436 452 468 484 500] [388 389 390 391 392 393 394 395]
[  5  21  37  53  69  85 101 117] [ 5  6  7  8  9 10 11 12]
[133 149 165 181 197 213 229 245] [133 134 135 136 137 138 139 140]
[261 277 293 309 325 341 357 373] [261 262 263 264 265 266 267 268]
[389 405 421 437 453 469 485 501] [389 390 391 392 393 394 395 396]
[  6  22  38  54  70  86 102 118] [ 6  7  8  9 10 11 12 13]
[134 150 166 182 198 214 230 246] [134 135 136 137 138 139 140 141]
[262 278 294 310 326 342 358 374] [262 263 264 265 266 267 268 269]
[390 406 422 438 454 470 486 502] [390 391 392 393 394 395 396 397]
[  7  23  39  55  71  87 103 119] [ 7  8  9 10 11 12 13 14]
[135 151 167 183 199 215 231 247] [135 136 137 138 139 140 141 142]
[263 279 295 311 327 343 359 375] [263 264 265 266 267 268 269 270]
[391 407 423 439 455 471 487 503] [391 392 393 394 395 396 397 398]
[  8  24  40  56  72  88 104 120] [ 8  9 10 11 12 13 14 15]
[136 152 168 184 200 216 232 248] [136 137 138 139 140 141 142 143]
[264 280 296 312 328 344 360 376] [264 265 266 267 268 269 270 271]
[392 408 424 440 456 472 488 504] [392 393 394 395 396 397 398 399]
[  9  25  41  57  73  89 105 121] [ 9 10 11 12 13 14 15 16]
[137 153 169 185 201 217 233 249] [137 138 139 140 141 142 143 144]
[265 281 297 313 329 345 361 377] [265 266 267 268 269 270 271 272]
[393 409 425 441 457 473 489 505] [393 394 395 396 397 398 399 400]
[ 10  26  42  58  74  90 106 122] [10 11 12 13 14 15 16 17]
[138 154 170 186 202 218 234 250] [138 139 140 141 142 143 144 145]
[266 282 298 314 330 346 362 378] [266 267 268 269 270 271 272 273]
[394 410 426 442 458 474 490 506] [394 395 396 397 398 399 400 401]
[ 11  27  43  59  75  91 107 123] [11 12 13 14 15 16 17 18]
[139 155 171 187 203 219 235 251] [139 140 141 142 143 144 145 146]
[267 283 299 315 331 347 363 379] [267 268 269 270 271 272 273 274]
[395 411 427 443 459 475 491 507] [395 396 397 398 399 400 401 402]
[ 12  28  44  60  76  92 108 124] [12 13 14 15 16 17 18 19]
[140 156 172 188 204 220 236 252] [140 141 142 143 144 145 146 147]
[268 284 300 316 332 348 364 380] [268 269 270 271 272 273 274 275]
[396 412 428 444 460 476 492 508] [396 397 398 399 400 401 402 403]
[ 13  29  45  61  77  93 109 125] [13 14 15 16 17 18 19 20]
[141 157 173 189 205 221 237 253] [141 142 143 144 145 146 147 148]
[269 285 301 317 333 349 365 381] [269 270 271 272 273 274 275 276]
[397 413 429 445 461 477 493 509] [397 398 399 400 401 402 403 404]
[ 14  30  46  62  78  94 110 126] [14 15 16 17 18 19 20 21]
[142 158 174 190 206 222 238 254] [142 143 144 145 146 147 148 149]
[270 286 302 318 334 350 366 382] [270 271 272 273 274 275 276 277]
[398 414 430 446 462 478 494 510] [398 399 400 401 402 403 404 405]
[ 15  31  47  63  79  95 111 127] [15 16 17 18 19 20 21 22]
[143 159 175 191 207 223 239 255] [143 144 145 146 147 148 149 150]
[271 287 303 319 335 351 367 383] [271 272 273 274 275 276 277 278]
[399 415 431 447 463 479 495 511] [399 400 401 402 403 404 405 406]

num_grid_x=4
num_grid_y=4
a = np.arange(4*8*num_grid_x*num_grid_y).reshape(32,num_grid_y,num_grid_x).reshape(4,8,num_grid_y,num_grid_x)
aa = a.flatten()
b = a.transpose(2,3,0,1) # num_grid_y num_grid_x 4 8
bb = b.flatten()
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(4):
            #print(b[i,j,k],"  ",a[k,:,i,j])
            t1 =i*num_grid_x*32+j*32+k*8
            t2 =k*8*num_grid_x*num_grid_y+i*num_grid_x+j
            print(f"{bb[t1:t1+8]} {aa[t2:t2+8*16:16]}")
[  0  16  32  48  64  80  96 112] [  0  16  32  48  64  80  96 112]
[128 144 160 176 192 208 224 240] [128 144 160 176 192 208 224 240]
[256 272 288 304 320 336 352 368] [256 272 288 304 320 336 352 368]
[384 400 416 432 448 464 480 496] [384 400 416 432 448 464 480 496]
[  1  17  33  49  65  81  97 113] [  1  17  33  49  65  81  97 113]
[129 145 161 177 193 209 225 241] [129 145 161 177 193 209 225 241]
[257 273 289 305 321 337 353 369] [257 273 289 305 321 337 353 369]
[385 401 417 433 449 465 481 497] [385 401 417 433 449 465 481 497]
[  2  18  34  50  66  82  98 114] [  2  18  34  50  66  82  98 114]
[130 146 162 178 194 210 226 242] [130 146 162 178 194 210 226 242]
[258 274 290 306 322 338 354 370] [258 274 290 306 322 338 354 370]
[386 402 418 434 450 466 482 498] [386 402 418 434 450 466 482 498]
[  3  19  35  51  67  83  99 115] [  3  19  35  51  67  83  99 115]
[131 147 163 179 195 211 227 243] [131 147 163 179 195 211 227 243]
[259 275 291 307 323 339 355 371] [259 275 291 307 323 339 355 371]
[387 403 419 435 451 467 483 499] [387 403 419 435 451 467 483 499]
[  4  20  36  52  68  84 100 116] [  4  20  36  52  68  84 100 116]
[132 148 164 180 196 212 228 244] [132 148 164 180 196 212 228 244]
[260 276 292 308 324 340 356 372] [260 276 292 308 324 340 356 372]
[388 404 420 436 452 468 484 500] [388 404 420 436 452 468 484 500]
[  5  21  37  53  69  85 101 117] [  5  21  37  53  69  85 101 117]
[133 149 165 181 197 213 229 245] [133 149 165 181 197 213 229 245]
[261 277 293 309 325 341 357 373] [261 277 293 309 325 341 357 373]
[389 405 421 437 453 469 485 501] [389 405 421 437 453 469 485 501]
[  6  22  38  54  70  86 102 118] [  6  22  38  54  70  86 102 118]
[134 150 166 182 198 214 230 246] [134 150 166 182 198 214 230 246]
[262 278 294 310 326 342 358 374] [262 278 294 310 326 342 358 374]
[390 406 422 438 454 470 486 502] [390 406 422 438 454 470 486 502]
[  7  23  39  55  71  87 103 119] [  7  23  39  55  71  87 103 119]
[135 151 167 183 199 215 231 247] [135 151 167 183 199 215 231 247]
[263 279 295 311 327 343 359 375] [263 279 295 311 327 343 359 375]
[391 407 423 439 455 471 487 503] [391 407 423 439 455 471 487 503]
[  8  24  40  56  72  88 104 120] [  8  24  40  56  72  88 104 120]
[136 152 168 184 200 216 232 248] [136 152 168 184 200 216 232 248]
[264 280 296 312 328 344 360 376] [264 280 296 312 328 344 360 376]
[392 408 424 440 456 472 488 504] [392 408 424 440 456 472 488 504]
[  9  25  41  57  73  89 105 121] [  9  25  41  57  73  89 105 121]
[137 153 169 185 201 217 233 249] [137 153 169 185 201 217 233 249]
[265 281 297 313 329 345 361 377] [265 281 297 313 329 345 361 377]
[393 409 425 441 457 473 489 505] [393 409 425 441 457 473 489 505]
[ 10  26  42  58  74  90 106 122] [ 10  26  42  58  74  90 106 122]
[138 154 170 186 202 218 234 250] [138 154 170 186 202 218 234 250]
[266 282 298 314 330 346 362 378] [266 282 298 314 330 346 362 378]
[394 410 426 442 458 474 490 506] [394 410 426 442 458 474 490 506]
[ 11  27  43  59  75  91 107 123] [ 11  27  43  59  75  91 107 123]
[139 155 171 187 203 219 235 251] [139 155 171 187 203 219 235 251]
[267 283 299 315 331 347 363 379] [267 283 299 315 331 347 363 379]
[395 411 427 443 459 475 491 507] [395 411 427 443 459 475 491 507]
[ 12  28  44  60  76  92 108 124] [ 12  28  44  60  76  92 108 124]
[140 156 172 188 204 220 236 252] [140 156 172 188 204 220 236 252]
[268 284 300 316 332 348 364 380] [268 284 300 316 332 348 364 380]
[396 412 428 444 460 476 492 508] [396 412 428 444 460 476 492 508]
[ 13  29  45  61  77  93 109 125] [ 13  29  45  61  77  93 109 125]
[141 157 173 189 205 221 237 253] [141 157 173 189 205 221 237 253]
[269 285 301 317 333 349 365 381] [269 285 301 317 333 349 365 381]
[397 413 429 445 461 477 493 509] [397 413 429 445 461 477 493 509]
[ 14  30  46  62  78  94 110 126] [ 14  30  46  62  78  94 110 126]
[142 158 174 190 206 222 238 254] [142 158 174 190 206 222 238 254]
[270 286 302 318 334 350 366 382] [270 286 302 318 334 350 366 382]
[398 414 430 446 462 478 494 510] [398 414 430 446 462 478 494 510]
[ 15  31  47  63  79  95 111 127] [ 15  31  47  63  79  95 111 127]
[143 159 175 191 207 223 239 255] [143 159 175 191 207 223 239 255]
[271 287 303 319 335 351 367 383] [271 287 303 319 335 351 367 383]
[399 415 431 447 463 479 495 511] [399 415 431 447 463 479 495 511]

So in C code it becomes:

// reg_1max = 8; tmp holds the 8 consecutive distribution bins for one box side
float* tmp = new float[reg_1max];
for (int m = 0; m < reg_1max; m++)
{
	tmp[m] = out_box[k * reg_1max * num_grid_y * num_grid_x + i * num_grid_x + j + m * num_grid_y * num_grid_x];
}

To understand it: in the full model's k x k x 4 x 8 layout, the 8 bins start at i*num_grid_x*4*8 + j*4*8 + k*8, and from that address you simply take the next 8 consecutive values.
In the pruned 4 x 8 x k x k layout, the Python access is a[k, :, i, j]; the colon means "take the whole axis", and in C this becomes k*8*num_grid_y*num_grid_x + i*num_grid_x + j + m*num_grid_y*num_grid_x, where m = 0, 1, ..., 7 plays the role of the colon and picks each of the 8 values along that dimension. A standalone check of this mapping is sketched below.
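
As with the classification head, the regression-head mapping can be verified with a standalone C++ test (my own sketch, mirroring the Python above): the pruned output is 32 x K x K with the 32 channels forming 4 groups of reg_1max = 8 bins, and the full model's (K*K) x 32 access at (i, j, k*8 + m) must land on element k*8*K*K + m*K*K + i*K + j of the pruned buffer, which is exactly the index used for tmp[m] above.

#include <cassert>
#include <iostream>
#include <vector>

int main()
{
	const int K = 4;            // grid size (num_grid_x = num_grid_y = K)
	const int reg_1max = 8;     // bins per box side; 32 = 4 * reg_1max

	// aa: pruned-model layout, channels first: (4 * reg_1max, K, K), values 0..N-1
	std::vector<int> aa(4 * reg_1max * K * K);
	for (int n = 0; n < (int)aa.size(); n++) aa[n] = n;

	// bb: full-model layout after Reshape + Transpose: (K*K, 4 * reg_1max)
	std::vector<int> bb(aa.size());
	for (int k = 0; k < 4; k++)
		for (int m = 0; m < reg_1max; m++)
			for (int i = 0; i < K; i++)
				for (int j = 0; j < K; j++)
					bb[(i * K + j) * 4 * reg_1max + k * reg_1max + m] =
						aa[(k * reg_1max + m) * K * K + i * K + j];

	// The 8 bins of side k at cell (i, j) are strided by K*K in the pruned buffer.
	for (int i = 0; i < K; i++)
		for (int j = 0; j < K; j++)
			for (int k = 0; k < 4; k++)
				for (int m = 0; m < reg_1max; m++)
					assert(bb[i * K * 4 * reg_1max + j * 4 * reg_1max + k * reg_1max + m] ==
					       aa[k * reg_1max * K * K + i * K + j + m * K * K]);

	std::cout << "regression index mapping verified" << std::endl;
	return 0;
}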

3. Improved softmax

The softmax implementation is swapped out: the new activation_function_softmax subtracts the maximum for numerical stability and uses a fast_exp approximation instead of exp.
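
fast_exp below builds its result directly in the float's bit pattern (a Schraudolph-style approximation of e^x). A quick way to see how much accuracy this costs on the range softmax actually sees (inputs are shifted by the max, so arguments are <= 0) is a small standalone comparison like the one below; this harness is my own sketch, not part of the original code, and the full modified inference listing follows right after it.

#include <cmath>
#include <cstdint>
#include <cstdio>

// Same bit-level approximation as PicoDet::fast_exp in the listing below.
static inline float fast_exp(float x)
{
	union { uint32_t i; float f; } v{};
	v.i = static_cast<uint32_t>((1 << 23) * (1.4426950409 * x + 126.93490512f));
	return v.f;
}

int main()
{
	float max_rel_err = 0.f;
	// Scan the typical post-shift softmax input range and track the worst relative error.
	for (float x = -20.f; x <= 0.f; x += 0.01f) {
		float approx = fast_exp(x);
		float exact = std::exp(x);
		float rel = std::fabs(approx - exact) / exact;
		if (rel > max_rel_err) max_rel_err = rel;
	}
	std::printf("max relative error of fast_exp on [-20, 0]: %.4f\n", max_rel_err);
	return 0;
}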

#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
#include <cstdint>
#include <vector>
#include <array>
#include <algorithm>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>

//using namespace cv;
//using namespace std;
//using namespace Ort;

typedef struct BoxInfo
{
	float x1;
	float y1;
	float x2;
	float y2;
	float score;
	int label;
} BoxInfo;

class PicoDet
{
public:
	PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold);
	void detect(cv::Mat& cv_image);
private:
	float score_threshold = 0.5;
	float nms_threshold = 0.5;
	std::vector<std::string> class_names;
	int num_class;

	cv::Mat resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left);
	std::vector<float> input_image_;
	void normalize_(cv::Mat img);
	inline float fast_exp(float x);
	template <typename _Tp>
	int activation_function_softmax(const _Tp* src, _Tp* dst, int length);
	//void softmax_(const float* x, float* y, int length);
	void generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box);
	void nms(std::vector<BoxInfo>& input_boxes);
	const bool keep_ratio = false;
	int inpWidth;
	int inpHeight;
	int num_outs;
	int reg_max;
	std::vector<int> stride;
	//const float mean[3] = { 103.53, 116.28, 123.675 };
	//const float stds[3] = { 57.375, 57.12, 58.395 };
	const float mean[3] = { 0.0, 0.0, 0.0 };
	const float stds[3] = { 255.0, 255.0, 255.0 };

	Ort::Env env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "picodet");
	Ort::Session* ort_session = nullptr;
	Ort::SessionOptions sessionOptions = Ort::SessionOptions();
	std::vector<char*> input_names;
	std::vector<char*> output_names;
	std::vector<std::vector<int64_t>> input_node_dims;  // >=1 inputs
	std::vector<std::vector<int64_t>> output_node_dims; // >=1 outputs
};
inline float PicoDet::fast_exp(float x) {
	union {
		uint32_t i;
		float f;
	} v{};
	v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
	return v.f;
}

template <typename _Tp>
int PicoDet::activation_function_softmax(const _Tp* src, _Tp* dst, int length) {
	const _Tp alpha = *std::max_element(src, src + length);
	_Tp denominator{ 0 };

	for (int i = 0; i < length; ++i) {
		dst[i] = fast_exp(src[i] - alpha);
		denominator += dst[i];
	}

	for (int i = 0; i < length; ++i) {
		dst[i] /= denominator;
	}

	return 0;
}
PicoDet::PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold)
{
	std::ifstream ifs(classesFile.c_str());
	std::string line;
	while (std::getline(ifs, line)) this->class_names.push_back(line);
	this->num_class = class_names.size();
	this->nms_threshold = nms_threshold;
	this->score_threshold = objThreshold;

	std::wstring widestr = std::wstring(model_path.begin(), model_path.end());
	//OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(sessionOptions, 0);
	sessionOptions.SetGraphOptimizationLevel(ORT_ENABLE_BASIC);
	ort_session = new Ort::Session(env, widestr.c_str(), sessionOptions);
	size_t numInputNodes = ort_session->GetInputCount();
	size_t numOutputNodes = ort_session->GetOutputCount();
	Ort::AllocatorWithDefaultOptions allocator;
	for (int i = 0; i < numInputNodes; i++)
	{
		input_names.push_back(ort_session->GetInputName(i, allocator));
		Ort::TypeInfo input_type_info = ort_session->GetInputTypeInfo(i);
		auto input_tensor_info = input_type_info.GetTensorTypeAndShapeInfo();
		auto input_dims = input_tensor_info.GetShape();
		input_node_dims.push_back(input_dims);
	}
	for (int i = 0; i < numOutputNodes; i++)
	{
		output_names.push_back(ort_session->GetOutputName(i, allocator));
		Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
		auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
		auto output_dims = output_tensor_info.GetShape();
		output_node_dims.push_back(output_dims);
		/*for (int j = 0; j < output_dims.size(); j++)
		{
			cout << output_dims[j] << ",";
		}
		cout << endl;*/
	}
	this->inpHeight = input_node_dims[0][2];
	this->inpWidth = input_node_dims[0][3];
	this->num_outs = int(numOutputNodes * 0.5);
	this->reg_max = output_node_dims[this->num_outs][output_node_dims[this->num_outs].size() - 1] / 4 - 1;
	for (int i = 0; i < this->num_outs; i++)
	{
		stride.push_back(int(8 * pow(2, i)));
	}
}

cv::Mat PicoDet::resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left)
{
	int srch = srcimg.rows, srcw = srcimg.cols;
	*newh = this->inpHeight;
	*neww = this->inpWidth;
	cv::Mat dstimg;
	if (this->keep_ratio && srch != srcw) {
		float hw_scale = (float)srch / srcw;
		if (hw_scale > 1) {
			*newh = this->inpHeight;
			*neww = int(this->inpWidth / hw_scale);
			cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_AREA);
			*left = int((this->inpWidth - *neww) * 0.5);
			copyMakeBorder(dstimg, dstimg, 0, 0, *left, this->inpWidth - *neww - *left, cv::BORDER_CONSTANT, 0);
		}
		else {
			*newh = (int)(this->inpHeight * hw_scale);
			*neww = this->inpWidth;
			cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_AREA);
			*top = (int)((this->inpHeight - *newh) * 0.5);
			copyMakeBorder(dstimg, dstimg, *top, this->inpHeight - *newh - *top, 0, 0, cv::BORDER_CONSTANT, 0);
		}
	}
	else {
		cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_AREA);
	}
	return dstimg;
}

void PicoDet::normalize_(cv::Mat img)
{
	//    img.convertTo(img, CV_32F);
	int row = img.rows;
	int col = img.cols;
	this->input_image_.resize(row * col * img.channels());
	for (int c = 0; c < 3; c++)
	{
		for (int i = 0; i < row; i++)
		{
			for (int j = 0; j < col; j++)
			{
				float pix = img.ptr<uchar>(i)[j * 3 + c];
				this->input_image_[c * row * col + i * col + j] = (pix / 255.0 - mean[c] / 255.0) / (stds[c] / 255.0);
				//this->input_image_[c * row * col + i * col + j] = (pix - mean[c]) / stds[c];
			}
		}
	}
}
/*
void PicoDet::softmax_(const float* x, float* y, int length)
{
	float sum = 0;
	int i = 0;
	for (i = 0; i < length; i++)
	{
		y[i] = exp(x[i]);
		sum += y[i];
	}
	for (i = 0; i < length; i++)
	{
		y[i] /= sum;
	}
}
*/



void PicoDet::generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box)
{
	const int num_grid_y = (int)ceil((float)this->inpHeight / stride_);
	const int num_grid_x = (int)ceil((float)this->inpWidth / stride_);
	cout << "num_grid_x=" << num_grid_x << ",num_grid_y=" << num_grid_y << endl;
	const int reg_1max = reg_max + 1;
	//std::cout << "score:" << std::endl;
	for (int i = 0; i < num_grid_y; i++)
	{
		for (int j = 0; j < num_grid_x; j++)
		{
			int max_ind = 0;
			float max_score = 0;
			
			for (int k = 0; k < num_class; k++)
			{   
				/* Indexing for the original (full) model output: */
				//float score = out_score[i * num_grid_x * num_class + j * num_class + k];
				/* The line below replaces the removed Reshape/Transpose with C indexing; use one of the two variants only. The mapping: (i,j,k) in the full output corresponds to (k,i,j) in the pruned output. */
				float score = std::sqrt(out_score[k*num_grid_y*num_grid_x+i*num_grid_x+j]);
				//std::cout <<score << " ";
				if (score > max_score)
				{
					max_score = score;
					max_ind = k;
				}
			}
			if (max_score >= score_threshold)
			{
				std::cout << "box:" << std::endl;
				//const float* pbox = out_box + idx * reg_1max * 4;
				float dis_pred[4];
				float* y = new float[reg_1max];
				for (int k = 0; k < 4; k++)
				{
					/* original (full) model */
					//const float* tmp = out_box + i * num_grid_x * reg_1max * 4 + j * reg_1max * 4 + k * reg_1max;
					//std::cout << "r:" << *tmp << std::endl;
					/* version for the pruned model without Reshape/Transpose */
					float* tmp = new float[reg_1max];
					for (int m = 0; m < reg_1max; m++)
					{
						tmp[m] = out_box[k * num_grid_y * num_grid_x * reg_1max + i * num_grid_x + j + m * num_grid_y * num_grid_x];
					}
					//std::cout << "r:" << *tmp << std::endl;
					//softmax_(tmp, y, reg_1max);
					activation_function_softmax(tmp, y, reg_1max);
					float dis = 0.f;
					for (int l = 0; l < reg_1max; l++)
					{
						dis += l * y[l];
					}
					dis_pred[k] = dis * stride_;
					delete[] tmp; // tmp is allocated with new for every box side; free it to avoid leaking
				}
				delete[] y;
				float pb_cx = (j + 0.5f) * stride_ - 0.5;
				float pb_cy = (i + 0.5f) * stride_ - 0.5;
				float x0 = pb_cx - dis_pred[0];
				float y0 = pb_cy - dis_pred[1];
				float x1 = pb_cx + dis_pred[2];
				float y1 = pb_cy + dis_pred[3];
				generate_boxes.push_back(BoxInfo{ x0, y0, x1, y1, max_score, max_ind });
			}
		}
	}
}

void PicoDet::nms(std::vector<BoxInfo>& input_boxes)
{
	sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
	std::vector<float> vArea(input_boxes.size());
	for (int i = 0; i < int(input_boxes.size()); ++i)
	{
		vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
			* (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
	}

	std::vector<bool> isSuppressed(input_boxes.size(), false);
	for (int i = 0; i < int(input_boxes.size()); ++i)
	{
		if (isSuppressed[i]) { continue; }
		for (int j = i + 1; j < int(input_boxes.size()); ++j)
		{
			if (isSuppressed[j]) { continue; }
			float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
			float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
			float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
			float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);

			float w = (std::max)(float(0), xx2 - xx1 + 1);
			float h = (std::max)(float(0), yy2 - yy1 + 1);
			float inter = w * h;
			float ovr = inter / (vArea[i] + vArea[j] - inter);

			if (ovr >= this->nms_threshold)
			{
				isSuppressed[j] = true;
			}
		}
	}
	// return post_nms;
	int idx_t = 0;
	input_boxes.erase(remove_if(input_boxes.begin(), input_boxes.end(), [&idx_t, &isSuppressed](const BoxInfo& f) { return isSuppressed[idx_t++]; }), input_boxes.end());
}

void PicoDet::detect(cv::Mat& srcimg)
{
	int newh = 0, neww = 0, top = 0, left = 0;
	cv::Mat cv_image = srcimg.clone();
	cv::Mat dst = this->resize_image(cv_image, &newh, &neww, &top, &left);
	this->normalize_(dst);
	std::array<int64_t, 4> input_shape_{ 1, 3, this->inpHeight, this->inpWidth };

	auto allocator_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
	Ort::Value input_tensor_ = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape_.data(), input_shape_.size());


	std::vector<Ort::Value> ort_outputs = ort_session->Run(Ort::RunOptions{ nullptr }, &input_names[0], &input_tensor_, 1, output_names.data(), output_names.size());   // run inference
	// generate proposals
	std::vector<BoxInfo> generate_boxes;
	for (int i = 0; i < this->num_outs; i++)
	{
		//auto cls_shape = this->output_node_dims[i];
		const float* cls_score = ort_outputs[i].GetTensorMutableData<float>();
		//std::vector<int64_t> new_cls_shape = { cls_shape[0],cls_shape[1],cls_shape[2] * cls_shape[3] };
		


		const float* bbox_pred = ort_outputs[i + this->num_outs].GetTensorMutableData<float>();
		//auto reg_shape = this->output_node_dims[i+this->num_outs];
		generate_proposal(generate_boxes, stride[i], cls_score, bbox_pred);
	}

	// Perform non-maximum suppression to eliminate redundant overlapping boxes with
	// lower confidences
	nms(generate_boxes);
	float ratioh = (float)cv_image.rows / newh;
	float ratiow = (float)cv_image.cols / neww;
	for (size_t i = 0; i < generate_boxes.size(); ++i)
	{
		int xmin = (int)std::max((generate_boxes[i].x1 - left) * ratiow, 0.f);
		int ymin = (int)std::max((generate_boxes[i].y1 - top) * ratioh, 0.f);
		int xmax = (int)std::min((generate_boxes[i].x2 - left) * ratiow, (float)cv_image.cols);
		int ymax = (int)std::min((generate_boxes[i].y2 - top) * ratioh, (float)cv_image.rows);
		rectangle(srcimg, cv::Point(xmin, ymin), cv::Point(xmax, ymax), cv::Scalar(0, 0, 255), 2);
		std::string label = cv::format("%.2f", generate_boxes[i].score);
		label = this->class_names[generate_boxes[i].label] + ":" + label;
		putText(srcimg, label, cv::Point(xmin, ymin - 5), cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 1);
	}
}

int main()
{
	//PicoDet mynet("C:/Users/tl/Desktop/demo_ncnn/ncnn_our/picodet_xs_320_voc_256_20230405_shape.onnx", "C:/Users/tl/Desktop/demo_ncnn/ncnn_our/ball.names", 0.5, 0.5);  /// choice = ["picodet_m_320_coco.onnx", "picodet_m_416_coco.onnx", "picodet_s_320_coco.onnx", "picodet_s_416_coco.onnx"]
	PicoDet mynet("C:/Users/tl/Desktop/demo_ncnn/ncnn_our/picodet_xs_320_voc_256_20230405_shape_sim_prune.onnx", "C:/Users/tl/Desktop/demo_ncnn/ncnn_our/ball.names", 0.5, 0.5);
	std::string imgpath = "C:/Users/tl/Desktop/demo_ncnn/ncnn_our/test.jpg";
	cv::Mat srcimg = cv::imread(imgpath);
	mynet.detect(srcimg);
	cv::imwrite("C:/Users/tl/Desktop/demo_ncnn/ncnn_our/test_result.jpg", srcimg);
	static const std::string kWinName = "Deep learning object detection in ONNXRuntime";
	cv::namedWindow(kWinName, cv::WINDOW_NORMAL);
	cv::imshow(kWinName, srcimg);
	cv::waitKey(0);
	cv::destroyAllWindows();
}