目录
目标检测算法简介
目标检测算法是一类计算机视觉技术,用于识别和定位图像或视频中的目标对象。常见的目标检测算法包括:
1. R-CNN (Region-based Convolutional Neural Networks):
- R-CNN: 通过选择性搜索生成候选区域,然后使用卷积神经网络(CNN)对每个候选区域进行分类。
- Fast R-CNN: 改进了R-CNN,通过共享卷积特征来加速处理。
- Faster R-CNN: 进一步改进了Fast R-CNN,引入了区域建议网络(RPN)来生成候选区域。
示例代码(注意:下面的示例使用 OpenCV DNN 加载 CaffeNet 对整帧图像进行分类,仅演示 R-CNN 中用 CNN 进行分类的步骤,不包含候选区域生成和边界框定位):
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <fstream>
using namespace cv;
using namespace dnn;
using namespace std;
// Classifies the whole frame with `net` and overlays the top-scoring class label.
//
// The frame is converted to a 224x224 blob and run through the network once; the
// index of the highest output value is used as the class id. This function does
// not produce candidate regions or bounding boxes -- it yields a single label for
// the entire image, drawn at a fixed position and echoed to stdout.
//
// frame      - image to classify; the label is drawn onto it in place.
// net        - loaded network whose output is a vector of per-class scores.
// classNames - labels indexed by class id; may be shorter than the network output.
void detectObjects(Mat& frame, Net& net, const vector<string>& classNames) {
    Mat blob;
    blobFromImage(frame, blob, 1.0, Size(224, 224), Scalar(104, 117, 123), false, false);
    net.setInput(blob);
    Mat prob = net.forward();

    // Flatten the output to a single row so the column of the maximum is the class id.
    double confidence = 0.0;
    Point classIdPoint;
    minMaxLoc(prob.reshape(1, 1), 0, &confidence, 0, &classIdPoint);
    const int classId = classIdPoint.x;

    if (!(confidence > 0.5)) return;

    // The label list can be empty or shorter than the network output (e.g. the
    // labels file failed to load); fall back to the numeric id instead of reading
    // past the end of the vector.
    const bool known = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
    const string label = known ? classNames[classId] : "class " + to_string(classId);

    putText(frame, label, Point(10, 30), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0), 2);
    cout << "Detected: " << label << " with confidence: " << confidence << endl;
}
int main() {
string modelConfiguration = "deploy.prototxt";
string modelWeights = "bvlc_reference_caffenet.caffemodel";
string classesFile = "synset_words.txt";
Net net = readNetFromCaffe(modelConfiguration, modelWeights);
vector<string> classNames;
ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line)) classNames.push_back(line);
VideoCapture cap(0);
if (!cap.isOpened()) {
cout << "Error opening video stream" << endl;
return -1;
}
while (true) {
Mat frame;
cap >> frame;
if (frame.empty()) break;
detectObjects(frame, net, classNames);
imshow("R-CNN Object Detection", frame);
if (waitKey(1) == 27) break; // Press 'ESC' to exit
}
cap.release();
destroyAllWindows();
return 0;
}
2. YOLO (You Only Look Once):
- YOLO将目标检测问题视为一个回归问题,直接在图像上预测边界框和类别概率。YOLO的主要优点是速度快,适合实时应用。
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <fstream>
using namespace cv;
using namespace dnn;
using namespace std;
// Runs the YOLO network on `frame` and draws the surviving detections onto it.
//
// Every output row is read as [centerX, centerY, width, height, <col 4>, class scores...],
// with the box values scaled by the frame size. Rows whose best class score exceeds
// the confidence threshold are collected, then non-maximum suppression removes
// overlapping duplicates before anything is drawn -- without it a single object is
// typically reported by many neighbouring rows and gets many stacked boxes.
//
// frame      - image to analyse; boxes and labels are drawn onto it in place.
// net        - loaded detection network.
// classNames - labels indexed by class id; may be shorter than the score vector.
void detectObjects(Mat& frame, Net& net, const vector<string>& classNames) {
    const float confThreshold = 0.5f;
    const float nmsThreshold = 0.4f;  // overlap (IoU) above which the weaker box is dropped

    Mat blob;
    blobFromImage(frame, blob, 1 / 255.0, Size(416, 416), Scalar(), true, false);
    net.setInput(blob);
    vector<Mat> outs;
    net.forward(outs, net.getUnconnectedOutLayersNames());

    // Pass 1: gather every candidate above the confidence threshold.
    vector<int> classIds;
    vector<float> confidences;
    vector<Rect> boxes;
    for (const Mat& out : outs) {
        for (int j = 0; j < out.rows; ++j) {
            // ptr<float>(j) honours the row stride, unlike stepping a raw pointer by cols.
            const float* data = out.ptr<float>(j);
            Mat scores = out.row(j).colRange(5, out.cols);
            Point classIdPoint;
            double confidence = 0.0;
            minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
            if (!(confidence > confThreshold)) continue;

            const int centerX = static_cast<int>(data[0] * frame.cols);
            const int centerY = static_cast<int>(data[1] * frame.rows);
            const int width = static_cast<int>(data[2] * frame.cols);
            const int height = static_cast<int>(data[3] * frame.rows);
            const int left = centerX - width / 2;
            const int top = centerY - height / 2;

            classIds.push_back(classIdPoint.x);
            confidences.push_back(static_cast<float>(confidence));
            boxes.push_back(Rect(left, top, width, height));
        }
    }

    // Pass 2: keep only the strongest box among heavily overlapping ones.
    vector<int> kept;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, kept);

    for (int idx : kept) {
        const Rect& box = boxes[idx];
        const int classId = classIds[idx];

        // Guard the label lookup: the label list can be empty or too short.
        const bool known = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
        const string label = known ? classNames[classId] : "class " + to_string(classId);

        rectangle(frame, box.tl(), box.br(), Scalar(0, 255, 0), 3);
        putText(frame, label, Point(box.x, box.y - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
    }
}
int main() {
string modelConfiguration = "yolov3.cfg";
string modelWeights = "yolov3.weights";
string classesFile = "coco.names";
Net net = readNetFromDarknet(modelConfiguration, modelWeights);
net.setPreferableBackend(DNN_BACKEND_OPENCV);
net.setPreferableTarget(DNN_TARGET_CPU);
vector<string> classNames;
ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line)) classNames.push_back(line);
VideoCapture cap(0);
if (!cap.isOpened()) {
cout << "Error opening video stream" << endl;
return -1;
}
while (true) {
Mat frame;
cap >> frame;
if (frame.empty()) break;
detectObjects(frame, net, classNames);
imshow("YOLO Object Detection", frame);
if (waitKey(1) == 27) break; // Press 'ESC' to exit
}
cap.release();
destroyAllWindows();
return 0;
}
3. SSD (Single Shot MultiBox Detector):
- SSD在不同尺度的特征图上进行检测,能够同时预测多个类别和边界框。SSD在速度和精度之间取得了良好的平衡。
#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <fstream>
using namespace cv;
using namespace dnn;
using namespace std;
// Runs the SSD network on `frame` and draws every detection above the confidence
// threshold onto it.
//
// The network output is indexed as a 4-D blob [0][0][i][k]; each detection i is
// read as k = 1: class id, 2: confidence, 3..6: left, top, right, bottom, with the
// box values scaled by the frame size.
//
// frame      - image to analyse; boxes and labels are drawn onto it in place.
// net        - loaded detection network.
// classNames - labels indexed by class id; may be shorter than the id range.
void detectObjects(Mat& frame, Net& net, const vector<string>& classNames) {
    const float confidenceThreshold = 0.5f;

    Mat blob;
    // The mean must be a full Scalar: a bare 127.5 converts to Scalar(127.5, 0, 0, 0),
    // which would subtract the mean from the first channel only.
    blobFromImage(frame, blob, 0.007843, Size(300, 300), Scalar(127.5, 127.5, 127.5), false, false);
    net.setInput(blob);
    Mat detections = net.forward();

    // Nothing to read unless the output has the expected 4-D layout with 7 values per row.
    if (detections.dims != 4 || detections.size[3] < 7) return;

    // Mat::at has no 4-index overload, so view the last two dimensions as a 2-D
    // table (one row per detection) sharing the same memory.
    Mat table(detections.size[2], detections.size[3], CV_32F, detections.ptr<float>());

    for (int i = 0; i < table.rows; ++i) {
        const float* det = table.ptr<float>(i);
        const float confidence = det[2];
        if (!(confidence > confidenceThreshold)) continue;

        const int classId = static_cast<int>(det[1]);
        const int left = static_cast<int>(det[3] * frame.cols);
        const int top = static_cast<int>(det[4] * frame.rows);
        const int right = static_cast<int>(det[5] * frame.cols);
        const int bottom = static_cast<int>(det[6] * frame.rows);

        // Guard the label lookup: the label list can be empty or too short.
        const bool known = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
        const string label = known ? classNames[classId] : "class " + to_string(classId);

        rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0), 2);
        putText(frame, label, Point(left, top - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
    }
}
int main() {
string modelConfiguration = "deploy.prototxt";
string modelWeights = "VGG_VOC0712_SSD_300x300_iter_120000.caffemodel";
string classesFile = "coco.names";
Net net = readNetFromCaffe(modelConfiguration, modelWeights);
vector<string> classNames;
ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line)) classNames.push_back(line);
VideoCapture cap(0);
if (!cap.isOpened()) {
cout << "Error opening video stream" << endl;
return -1;
}
while (true) {
Mat frame;
cap >> frame;
if (frame.empty()) break;
detectObjects(frame, net, classNames);
imshow("SSD Object Detection", frame);
if (waitKey(1) == 27) break; // Press 'ESC' to exit
}
cap.release();
destroyAllWindows();
return 0;
}
4. RetinaNet:
- RetinaNet引入了Focal Loss来处理单阶段检测器训练中前景与背景样本极度不平衡的问题,并结合特征金字塔网络(FPN)进行多尺度检测,因此对小目标也有较好的检测效果。
#include <torch/script.h> // One-stop header.
#include <opencv2/opencv.hpp>
#include <fstream>
#include <iostream>
#include <memory>
using namespace cv;
using namespace std;
// Runs the TorchScript detection module on `frame` and draws every detection above
// the confidence threshold onto it.
//
// The frame is resized to 800x800, scaled to [0, 1] floats, reordered from HWC to
// CHW and moved to the CUDA device before the forward pass. The module output is
// read as a tuple of (boxes, scores, labels).
//
// NOTE(review): the tuple order and the box coordinate scale are taken from how
// this code indexes the output; box values are multiplied by the frame size, which
// presumes they are normalised to [0, 1] -- confirm against the exported model.
// NOTE(review): channel order is left as delivered by OpenCV; verify whether the
// model expects it swapped.
//
// frame      - image to analyse; boxes and labels are drawn onto it in place.
// module     - loaded TorchScript module, expected to live on the CUDA device.
// classNames - labels indexed by class id; may be shorter than the id range.
void detectObjects(Mat& frame, torch::jit::script::Module& module, const vector<string>& classNames) {
    const float confidenceThreshold = 0.5f;

    // Preprocess the image
    Mat img;
    cv::resize(frame, img, Size(800, 800));
    img.convertTo(img, CV_32F, 1.0 / 255);

    // from_blob does not copy: `img` must stay alive until the tensor has been
    // transferred to the device below. The dtype is spelled out to match CV_32F.
    auto input_tensor = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kFloat32);
    input_tensor = input_tensor.permute({0, 3, 1, 2});
    input_tensor = input_tensor.to(torch::kCUDA);

    // Forward pass
    auto output = module.forward({input_tensor}).toTuple();

    // Postprocess the output
    auto detections = output->elements()[0].toTensor().to(torch::kCPU);
    auto scores = output->elements()[1].toTensor().to(torch::kCPU);
    auto labels = output->elements()[2].toTensor().to(torch::kCPU);

    for (int i = 0; i < detections.size(0); ++i) {
        if (!(scores[i].item<float>() > confidenceThreshold)) continue;

        const int classId = labels[i].item<int>();
        auto box = detections[i];
        const int left = static_cast<int>(box[0].item<float>() * frame.cols);
        const int top = static_cast<int>(box[1].item<float>() * frame.rows);
        const int right = static_cast<int>(box[2].item<float>() * frame.cols);
        const int bottom = static_cast<int>(box[3].item<float>() * frame.rows);

        // Guard the label lookup: the label list can be empty or too short.
        const bool known = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
        const string label = known ? classNames[classId] : "class " + to_string(classId);

        rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0), 2);
        putText(frame, label, Point(left, top - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
    }
}
int main() {
// Load the model
string modelPath = "retinanet_model.pt";
torch::jit::script::Module module;
try {
module = torch::jit::load(modelPath);
module.to(torch::kCUDA);
}
catch (const c10::Error& e) {
cerr << "Error loading the model\n";
return -1;
}
// Load class names
string classesFile = "coco.names";
vector<string> classNames;
ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line)) classNames.push_back(line);
// Open video capture
VideoCapture cap(0);
if (!cap.isOpened()) {
cout << "Error opening video stream" << endl;
return -1;
}
while (true) {
Mat frame;
cap >> frame;
if (frame.empty()) break;
detectObjects(frame, module, classNames);
imshow("RetinaNet Object Detection", frame);
if (waitKey(1) == 27) break; // Press 'ESC' to exit
}
cap.release();
destroyAllWindows();
return 0;
}
5. Mask R-CNN:
- Mask R-CNN是Faster R-CNN的扩展,不仅可以进行目标检测,还可以进行实例分割。
这些算法通常使用深度学习框架(如TensorFlow、PyTorch)进行实现,并且需要大量标注数据进行训练。选择合适的目标检测算法取决于具体应用场景的需求,如实时性、精度和计算资源等。
下面是 Mask R-CNN 的示例代码(使用 LibTorch 加载 TorchScript 模型):
#include <torch/script.h> // One-stop header.
#include <opencv2/opencv.hpp>
#include <iostream>
#include <memory>
using namespace cv;
using namespace std;
// Runs the TorchScript Mask R-CNN module on `frame` and draws every detection above
// the confidence threshold onto it: a box, a label, and the instance mask painted
// over the part of the box that lies inside the frame.
//
// The frame is resized to 800x800, scaled to [0, 1] floats, reordered from HWC to
// CHW and moved to the CUDA device before the forward pass. The module output is
// read as a tuple of (boxes, labels, scores, masks), and masks is indexed as
// masks[detection][classId] to obtain a 2-D mask.
//
// NOTE(review): the tuple order, the per-class mask layout and the box coordinate
// scale are taken from how this code indexes the output; box values are multiplied
// by the frame size, which presumes they are normalised to [0, 1] -- confirm
// against the exported model.
//
// frame      - image to analyse; drawn onto in place.
// module     - loaded TorchScript module, expected to live on the CUDA device.
// classNames - labels indexed by class id; may be shorter than the id range.
void detectObjects(Mat& frame, torch::jit::script::Module& module, const vector<string>& classNames) {
    const float confidenceThreshold = 0.5f;

    // Preprocess the image
    Mat img;
    cv::resize(frame, img, Size(800, 800));
    img.convertTo(img, CV_32F, 1.0 / 255);

    // from_blob does not copy: `img` must stay alive until the tensor has been
    // transferred to the device below. The dtype is spelled out to match CV_32F.
    auto input_tensor = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kFloat32);
    input_tensor = input_tensor.permute({0, 3, 1, 2});
    input_tensor = input_tensor.to(torch::kCUDA);

    // Forward pass
    auto output = module.forward({input_tensor}).toTuple();

    // Postprocess the output
    auto boxes = output->elements()[0].toTensor().to(torch::kCPU);
    auto labels = output->elements()[1].toTensor().to(torch::kCPU);
    auto scores = output->elements()[2].toTensor().to(torch::kCPU);
    auto masks = output->elements()[3].toTensor().to(torch::kCPU);

    const Rect frameArea(0, 0, frame.cols, frame.rows);

    for (int i = 0; i < boxes.size(0); ++i) {
        if (!(scores[i].item<float>() > confidenceThreshold)) continue;

        const int classId = labels[i].item<int>();
        auto box = boxes[i];
        const int left = static_cast<int>(box[0].item<float>() * frame.cols);
        const int top = static_cast<int>(box[1].item<float>() * frame.rows);
        const int right = static_cast<int>(box[2].item<float>() * frame.cols);
        const int bottom = static_cast<int>(box[3].item<float>() * frame.rows);

        // Guard the label lookup: the label list can be empty or too short.
        const bool known = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
        const string label = known ? classNames[classId] : "class " + to_string(classId);

        rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0), 2);
        putText(frame, label, Point(left, top - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);

        // Extract and draw the mask.
        // Skip it when the box is degenerate, lies entirely outside the frame, or the
        // class id has no mask channel: resize() rejects empty sizes and taking an
        // ROI that leaves the frame throws.
        const Rect boxRect(left, top, right - left, bottom - top);
        if (boxRect.width <= 0 || boxRect.height <= 0) continue;
        const Rect visible = boxRect & frameArea;
        if (visible.width <= 0 || visible.height <= 0) continue;
        if (classId < 0 || classId >= masks.size(1)) continue;

        // contiguous() guarantees the raw pointer below walks a dense H x W buffer.
        auto mask = masks[i][classId].contiguous();
        Mat maskMat(Size(static_cast<int>(mask.size(1)), static_cast<int>(mask.size(0))), CV_32F,
                    mask.data_ptr<float>());

        // Stretch the mask to the full box, then keep only the part inside the frame.
        Mat boxMask;
        cv::resize(maskMat, boxMask, boxRect.size());
        const Rect local(visible.x - boxRect.x, visible.y - boxRect.y, visible.width, visible.height);

        Mat coloredRoi = frame(visible);
        coloredRoi.setTo(Scalar(0, 0, 255), boxMask(local) > 0.5);
    }
}
int main() {
// Load the model
string modelPath = "mask_rcnn_model.pt";
torch::jit::script::Module module;
try {
module = torch::jit::load(modelPath);
module.to(torch::kCUDA);
}
catch (const c10::Error& e) {
cerr << "Error loading the model\n";
return -1;
}
// Load class names
string classesFile = "coco.names";
vector<string> classNames;
ifstream ifs(classesFile.c_str());
string line;
while (getline(ifs, line)) classNames.push_back(line);
// Open video capture
VideoCapture cap(0);
if (!cap.isOpened()) {
cout << "Error opening video stream" << endl;
return -1;
}
while (true) {
Mat frame;
cap >> frame;
if (frame.empty()) break;
detectObjects(frame, module, classNames);
imshow("Mask R-CNN Object Detection", frame);
if (waitKey(1) == 27) break; // Press 'ESC' to exit
}
cap.release();
destroyAllWindows();
return 0;
}