OpenCV: YOLACT Instance Segmentation Model Inference

Task: get familiar with OpenCV's DNN module; given a natural-scene image, run inference with a pre-trained YOLACT model and output the object detection and segmentation results.

Analysis:
1) OpenCV's DNN module supports many deep-learning tasks, including face detection, image classification, segmentation, and object detection, and it can load models exported from frameworks such as PyTorch, TensorFlow, and PaddlePaddle (see the OpenCV/dnn code repository).
2) Deep-learning inference generally follows these steps: load the model (configuration file and weights); pre-process the input image into the data type and size the model expects; run the forward pass; and post-process the predictions. For instance segmentation, the post-processing is mainly NMS and mask assembly, which in this YOLACT demo are carried out after the forward pass inside the detect() routine. A minimal sketch of the generic flow follows below.
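
The generic flow above maps onto just a few OpenCV DNN calls. The snippet below is a minimal sketch of that flow, not YOLACT-specific: the model file name, input size, and blob parameters are placeholders that would have to be adapted to the actual network.

// dnn_pipeline_sketch.cpp (illustrative only) //
#include <opencv2/dnn.hpp>
#include <vector>

cv::Mat runForward(const cv::Mat& image)
{
	// 1) Load the model (ONNX here; Caffe, TensorFlow and Darknet formats also work).
	cv::dnn::Net net = cv::dnn::readNet("model.onnx");

	// 2) Pre-process: resize the image and pack it into a 4-D NCHW float blob.
	cv::Mat blob = cv::dnn::blobFromImage(image, 1.0, cv::Size(550, 550),
	                                      cv::Scalar(), /*swapRB=*/true, /*crop=*/false);

	// 3) Forward pass: collect every output blob of the network.
	net.setInput(blob);
	std::vector<cv::Mat> outs;
	net.forward(outs, net.getUnconnectedOutLayersNames());

	// 4) Post-processing (box decoding, NMS, mask assembly) is model-specific;
	//    the full YOLACT version is shown in the code example below.
	return outs.empty() ? cv::Mat() : outs[0];
}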

  • Results:
main.exe -h
#######
Usage: main.exe [params] image confThreshold nmsThreshold

        -?, -h, --help, --usage (value:true)
                OpenCV based deep learning demo

        image
                Image to process
        confThreshold (value:0.5)
                confidence threshold, default 0.5
        nmsThreshold (value:0.3)
                nms threshold, default 0.3

main.exe example_01.jpg
  • Code example:
// main.cpp //
#include <fstream>
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include "yolact.cpp"

using namespace cv;
using namespace dnn;
using namespace std;

bool parseParam(int argc, char** argv, const char* keys, Mat& img, float& confThreshold, float& nmsThreshold){
	CommandLineParser parser(argc, argv, keys);
	if(parser.has("help")){
		parser.printMessage();
		return false;
	}
	if(!parser.check()){
		parser.printErrors();
		return false;
	}
	String imgFile = parser.get<String>(0);
	img = imread(imgFile);
	if(img.empty()){
		cout << "wrong image path ! please check again." << endl;
		return false;
	}
	confThreshold = parser.get<float>(1);
	nmsThreshold = parser.get<float>(2);
	return true;
}

int main(int argc, char** argv)
{	
	const char* keys  = {
        "{help h usage ? | | OpenCV based deep learning demo}"
        "{@image | | Image to process}"
        "{@confThreshold | 0.5 | confidence threshold, default 0.5}"
        "{@nmsThreshold | 0.3 | nms threshold, default 0.3}"};
	float confThreshold, nmsThreshold;
	Mat srcimg;
	if(!parseParam(argc, argv, keys, srcimg, confThreshold, nmsThreshold)){
        return 0;
    }
	yolact yolactnet(confThreshold, nmsThreshold);
	
	yolactnet.detect(srcimg);

	static const string kWinName = "Deep learning object detection in OpenCV";
	namedWindow(kWinName, WINDOW_AUTOSIZE);
	imshow(kWinName, srcimg);
	waitKey(0);
	destroyAllWindows();
}
// yolact.cpp
#include <fstream>
#include <iostream>
#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
#include "config.cpp"

using namespace cv;
using namespace dnn;
using namespace std;

class yolact
{
	public:
		yolact(float confThreshold, float nmsThreshold, const int keep_top_k = 200);
		~yolact() { delete[] priorbox; }  // priorbox is allocated with new[] in the constructor
		void detect(Mat& srcimg);
	private:
		const int target_size = 550;
		const float MEANS[3] = { 123.68, 116.78, 103.94 };
		const float STD[3] = { 58.40, 57.12, 57.38 };
		float confidence_threshold;
		float nms_threshold;
		int keep_top_k;
		const int conv_ws[5] = { 69, 35, 18, 9, 5 };
		const int conv_hs[5] = { 69, 35, 18, 9, 5 };
		const float aspect_ratios[3] = { 1.f, 0.5f, 2.f };
		const float scales[5] = { 24.f, 48.f, 96.f, 192.f, 384.f };
		const float var[4] = { 0.1f, 0.1f, 0.2f, 0.2f };
		const int mask_h = 138;
		const int mask_w = 138;
		int num_priors;
		float* priorbox;
		Net net;
		void normalize(Mat& img);
		void sigmoid(Mat& out, int length);
};

yolact::yolact(float confThreshold, float nmsThreshold, const int keep_top_k)
{
	this->confidence_threshold = confThreshold;
	this->nms_threshold = nmsThreshold;
	this->keep_top_k = keep_top_k;
	this->net = readNet("yolact_base_54_800000.onnx");
	this->num_priors = 0;
	int p = 0;
	for (p = 0; p < 5; p++)
	{
		this->num_priors += this->conv_ws[p] * this->conv_hs[p] * 3;
	}
	this->priorbox = new float[4 * this->num_priors];
	// generate prior boxes in center-size form, normalized to [0, 1]
	// (5 feature-map scales x 3 anchors per location = 19248 priors in total)
	float* pb = priorbox;
	for (p = 0; p < 5; p++)
	{
		int conv_w = this->conv_ws[p];
		int conv_h = this->conv_hs[p];

		float scale = this->scales[p];

		for (int i = 0; i < conv_h; i++)
		{
			for (int j = 0; j < conv_w; j++)
			{
				// +0.5 because priors are in center-size notation
				float cx = (j + 0.5f) / conv_w;
				float cy = (i + 0.5f) / conv_h;

				for (int k = 0; k < 3; k++)
				{
					float ar = aspect_ratios[k];

					ar = sqrt(ar);

					float w = scale * ar / this->target_size;
					float h = scale / ar / this->target_size;

					// This is for backward compatibility with a bug where I made everything square by accident
					// cfg.backbone.use_square_anchors:
					h = w;
					pb[0] = cx;
					pb[1] = cy;
					pb[2] = w;
					pb[3] = h;
					pb += 4;
				}
			}
		}
	}
}

void yolact::normalize(Mat& img)
{
	img.convertTo(img, CV_32F);
	int i = 0, j = 0;
	for (i = 0; i < img.rows; i++)
	{
		float* pdata = (float*)(img.data + i * img.step);
		for (j = 0; j < img.cols; j++)
		{
			pdata[0] = (pdata[0] - this->MEANS[0]) / this->STD[0];
			pdata[1] = (pdata[1] - this->MEANS[1]) / this->STD[1];
			pdata[2] = (pdata[2] - this->MEANS[2]) / this->STD[2];
			pdata += 3;
		}
	}
}

void yolact::sigmoid(Mat& out, int length)
{
	float* pdata = (float*)(out.data);
	int i = 0;
	for (i = 0; i < length; i++)
	{
		pdata[i] = 1.0 / (1 + expf(-pdata[i]));
	}
}

void yolact::detect(Mat& srcimg)
{
	int img_w = srcimg.cols;
	int img_h = srcimg.rows;
	Mat img;
	// resize to the fixed network input size (no aspect-ratio preserving padding)
	resize(srcimg, img, Size(this->target_size, this->target_size), 0, 0, INTER_LINEAR);
	cvtColor(img, img, COLOR_BGR2RGB);
	this->normalize(img);
	Mat blob = blobFromImage(img);
	this->net.setInput(blob);
	vector<Mat> outs;
	this->net.forward(outs, this->net.getUnconnectedOutLayersNames());
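	// Output blobs, in the order the code below indexes them (layout assumed
	// from the exported yolact_base_54_800000.onnx model):
	//   outs[0] - box regression deltas, 4 values per prior
	//   outs[1] - class scores, num_class values per prior (column 0 = background)
	//   outs[2] - per-prior mask coefficients
	//   outs[3] - prototype masks, one mask_h x mask_w map per coefficient channel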
	
	// generate proposals
	vector<int> classIds;
	vector<float> confidences;
	vector<Rect> boxes;
	vector<int> maskIds;
	const int num_class = outs[1].cols;
	for (int i = 0; i < this->num_priors; i++)
	{
		Mat scores = outs[1].row(i).colRange(1, num_class);
		Point classIdPoint;
		double score;
		// Get the value and location of the maximum score
		minMaxLoc(scores, 0, &score, 0, &classIdPoint);
		if (score > this->confidence_threshold)
		{
			const float* loc = (float*)outs[0].data + i * 4;
			const float* pb = this->priorbox + i * 4;
			float pb_cx = pb[0];
			float pb_cy = pb[1];
			float pb_w = pb[2];
			float pb_h = pb[3];
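			// SSD-style decoding: apply the predicted deltas loc[] to the prior
			// box (center-size form), scaled by the fixed variances, then convert
			// to corner coordinates and clip to the image.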

			float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
			float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
			float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w);
			float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h);
			float obj_x1 = bbox_cx - bbox_w * 0.5f;
			float obj_y1 = bbox_cy - bbox_h * 0.5f;
			float obj_x2 = bbox_cx + bbox_w * 0.5f;
			float obj_y2 = bbox_cy + bbox_h * 0.5f;

			// clip
			obj_x1 = max(min(obj_x1 * img_w, (float)(img_w - 1)), 0.f);
			obj_y1 = max(min(obj_y1 * img_h, (float)(img_h - 1)), 0.f);
			obj_x2 = max(min(obj_x2 * img_w, (float)(img_w - 1)), 0.f);
			obj_y2 = max(min(obj_y2 * img_h, (float)(img_h - 1)), 0.f);
			classIds.push_back(classIdPoint.x);
			confidences.push_back(score);
			boxes.push_back(Rect((int)obj_x1, (int)obj_y1, (int)(obj_x2 - obj_x1 + 1), (int)(obj_y2 - obj_y1 + 1)));
			maskIds.push_back(i);
		}
	}

	// Perform non maximum suppression to eliminate redundant overlapping boxes with
	// lower confidences
	vector<int> indices;
	NMSBoxes(boxes, confidences, this->confidence_threshold, this->nms_threshold, indices, 1.f, this->keep_top_k);
	for (size_t i = 0; i < indices.size(); ++i)
	{
		int idx = indices[i];
		Rect box = boxes[idx];
		int xmax = box.x + box.width;
		int ymax = box.y + box.height;
		rectangle(srcimg, Point(box.x, box.y), Point(xmax, ymax), Scalar(0, 0, 255), 3);
		//Get the label for the class name and its confidence
		char text[256];
		sprintf(text, "%s: %.2f", class_names[classIds[idx] + 1], confidences[idx]);


		//Display the label at the top of the bounding box
		int baseLine;
		Size labelSize = getTextSize(text, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
		int ymin = max(box.y, labelSize.height);
		//rectangle(frame, Point(left, top - int(1.5 * labelSize.height)), Point(left + int(1.5 * labelSize.width), top + baseLine), Scalar(0, 255, 0), FILLED);
		putText(srcimg, text, Point(box.x, ymin), FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0), 1);

		Mat mask(this->mask_h, this->mask_w, CV_32FC1);
		mask = cv::Scalar(0.f);
		int channel = outs[2].cols;
		int area = this->mask_h * this->mask_w;
		float* coeff = (float*)outs[2].data + maskIds[idx] * channel;
		float* pm = (float*)mask.data;
		const float* pmaskmap = (float*)outs[3].data;
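		// YOLACT mask assembly: each mask pixel is the dot product between the
		// prototype activations at that position and this instance's coefficient
		// vector; a sigmoid is applied afterwards.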
		for (int j = 0; j < area; j++)
		{
			for (int p = 0; p < channel; p++)
			{
				pm[j] += pmaskmap[p] * coeff[p];
			}
			pmaskmap += channel;
		}

		this->sigmoid(mask, area);
		Mat mask2;
		resize(mask, mask2, Size(img_w, img_h));
		// draw mask
		for (int y = 0; y < img_h; y++)
		{
			const float* pmask = mask2.ptr<float>(y);
			uchar* p = srcimg.ptr<uchar>(y);
			for (int x = 0; x < img_w; x++)
			{
				if (pmask[x] > 0.5)
				{
					p[0] = (uchar)(p[0] * 0.5 + colors[classIds[idx] + 1][0] * 0.5);
					p[1] = (uchar)(p[1] * 0.5 + colors[classIds[idx] + 1][1] * 0.5);
					p[2] = (uchar)(p[2] * 0.5 + colors[classIds[idx] + 1][2] * 0.5);
				}
				p += 3;
			}
		}
	}
}
// config.hpp //
extern const char* class_names[];
extern const unsigned char colors[81][3];

// config.cpp //
#include"config.hpp"
extern const char* class_names[] = { "background",
										"person", "bicycle", "car", "motorcycle", "airplane", "bus",
										"train", "truck", "boat", "traffic light", "fire hydrant",
										"stop sign", "parking meter", "bench", "bird", "cat", "dog",
										"horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
										"backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
										"skis", "snowboard", "sports ball", "kite", "baseball bat",
										"baseball glove", "skateboard", "surfboard", "tennis racket",
										"bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
										"banana", "apple", "sandwich", "orange", "broccoli", "carrot",
										"hot dog", "pizza", "donut", "cake", "chair", "couch",
										"potted plant", "bed", "dining table", "toilet", "tv", "laptop",
										"mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
										"toaster", "sink", "refrigerator", "book", "clock", "vase",
										"scissors", "teddy bear", "hair drier", "toothbrush"
};

extern const unsigned char colors[81][3] = {{56, 0, 255}, {226, 255, 0}, {0, 94, 255},
	{0, 37, 255}, {0, 255, 94}, {255, 226, 0}, {0, 18, 255}, {255, 151, 0},
	{170, 0, 255}, {0, 255, 56}, {255, 0, 75}, {0, 75, 255}, {0, 255, 169},
	{255, 0, 207}, {75, 255, 0}, {207, 0, 255}, {37, 0, 255}, {0, 207, 255},
	{94, 0, 255}, {0, 255, 113}, {255, 18, 0}, {255, 0, 56}, {18, 0, 255},
	{0, 255, 226}, {170, 255, 0}, {255, 0, 245}, {151, 255, 0}, {132, 255, 0},
	{75, 0, 255}, {151, 0, 255}, {0, 151, 255}, {132, 0, 255}, {0, 255, 245},
	{255, 132, 0}, {226, 0, 255}, {255, 37, 0}, {207, 255, 0},
	{0, 255, 207}, {94, 255, 0}, {0, 226, 255},
	{56, 255, 0}, {255, 94, 0}, {255, 113, 0},{0, 132, 255}, {255, 0, 132},
	{255, 170, 0}, {255, 0, 188}, {113, 255, 0}, {245, 0, 255}, {113, 0, 255},
	{255, 188, 0}, {0, 113, 255}, {255, 0, 0}, {0, 56, 255}, {255, 0, 113},
	{0, 255, 188}, {255, 0, 94}, {255, 0, 18}, {18, 255, 0}, {0, 255, 132},
	{0, 188, 255}, {0, 245, 255}, {0, 169, 255},{37, 255, 0},
	{255, 0, 151}, {188, 0, 255}, {0, 255, 37}, {0, 255, 0},
	{255, 0, 170}, {255, 0, 37}, {255, 75, 0}, {0, 0, 255}, {255, 207, 0},
	{255, 0, 226}, {255, 245, 0}, {188, 255, 0}, {0, 255, 18}, {0, 255, 75},
	{0, 255, 151}, {255, 56, 0}, {245, 255, 0}
};