目标检测算法简介

申徒嘉

于 2024-08-05 09:04:13 发布

阅读量270

点赞数 16

分类专栏：人工智能 | 文章标签：文心一言、目标检测

本文链接：https://blog.csdn.net/ChuJian_cao/article/details/140917430

版权

人工智能专栏收录该内容

1 篇文章 0 订阅

订阅专栏

目标检测算法简介

目标检测算法是一类计算机视觉技术，用于识别和定位图像或视频中的目标对象。常见的目标检测算法包括：

1. R-CNN (Region-based Convolutional Neural Networks):

R-CNN: 通过选择性搜索生成候选区域，然后使用卷积神经网络（CNN）对每个候选区域进行分类。
Fast R-CNN: 改进了R-CNN，通过共享卷积特征来加速处理。
Faster R-CNN: 进一步改进了Fast R-CNN，引入了区域建议网络（RPN）来生成候选区域。

示例代码（注意：下面的代码使用 CaffeNet 分类模型对整帧图像进行分类，仅演示 OpenCV DNN 模块的基本调用流程，不包含候选区域生成和边界框预测，因此并不是完整的 R-CNN 检测实现）：

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <fstream>

using namespace cv;
using namespace dnn;
using namespace std;

/**
 * Runs `net` on the whole frame, treats the output as a flat vector of class
 * scores, and when the best score exceeds 0.5 writes the winning label onto
 * the frame and logs it to stdout.
 *
 * @param frame       Image to classify; annotated in place. Empty frames are ignored.
 * @param net         Loaded network used for the forward pass.
 * @param classNames  Labels indexed by class id. An id without a label is
 *                    reported as "class <id>" rather than read out of bounds.
 */
void detectObjects(Mat& frame, Net& net, const vector<string>& classNames) {
    // blobFromImage cannot work on an empty image; skip instead of throwing.
    if (frame.empty()) return;

    Mat blob;
    blobFromImage(frame, blob, 1.0, Size(224, 224), Scalar(104, 117, 123), false, false);
    net.setInput(blob);
    Mat prob = net.forward();

    // Flatten the scores into a single row so the x coordinate of the maximum
    // is the class index.
    double confidence = 0.0;
    Point classIdPoint;
    minMaxLoc(prob.reshape(1, 1), nullptr, &confidence, nullptr, &classIdPoint);
    const int classId = classIdPoint.x;

    if (confidence > 0.5) {
        // The label list comes from an external file and may be shorter than
        // the network's output, so never index it unchecked.
        const bool hasLabel = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
        const string label = hasLabel ? classNames[classId] : "class " + to_string(classId);

        putText(frame, label, Point(10, 30), FONT_HERSHEY_SIMPLEX, 1, Scalar(0, 255, 0), 2);
        cout << "Detected: " << label << " with confidence: " << confidence << endl;
    }
}

int main() {
    string modelConfiguration = "deploy.prototxt";
    string modelWeights = "bvlc_reference_caffenet.caffemodel";
    string classesFile = "synset_words.txt";

    Net net = readNetFromCaffe(modelConfiguration, modelWeights);

    vector<string> classNames;
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classNames.push_back(line);

    VideoCapture cap(0);
    if (!cap.isOpened()) {
        cout << "Error opening video stream" << endl;
        return -1;
    }

    while (true) {
        Mat frame;
        cap >> frame;
        if (frame.empty()) break;

        detectObjects(frame, net, classNames);

        imshow("R-CNN Object Detection", frame);
        if (waitKey(1) == 27) break; // Press 'ESC' to exit
    }

    cap.release();
    destroyAllWindows();
    return 0;
}

2. YOLO (You Only Look Once):

YOLO将目标检测问题视为一个回归问题，直接在图像上预测边界框和类别概率。YOLO的主要优点是速度快，适合实时应用。

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <fstream>

using namespace cv;
using namespace dnn;
using namespace std;

/**
 * Runs the network on `frame`, collects every candidate whose best class
 * score exceeds the confidence threshold, removes overlapping duplicates with
 * non-maximum suppression, and draws the surviving boxes and labels.
 *
 * Each output row is read as [centerX, centerY, width, height, <unused>,
 * score_0, score_1, ...] with the box values relative to the frame size.
 *
 * @param frame       Image to run detection on; annotated in place. Empty frames are ignored.
 * @param net         Loaded network; all unconnected output layers are evaluated.
 * @param classNames  Labels indexed by class id. An id without a label is
 *                    drawn as "class <id>" rather than read out of bounds.
 */
void detectObjects(Mat& frame, Net& net, const vector<string>& classNames) {
    if (frame.empty()) return;

    Mat blob;
    blobFromImage(frame, blob, 1/255.0, Size(416, 416), Scalar(), true, false);
    net.setInput(blob);
    vector<Mat> outs;
    net.forward(outs, net.getUnconnectedOutLayersNames());

    const float confThreshold = 0.5f;
    const float nmsThreshold = 0.4f;  // IoU above which the weaker of two boxes is dropped

    vector<int> classIds;
    vector<float> confidences;
    vector<Rect> boxes;

    for (size_t i = 0; i < outs.size(); ++i) {
        const Mat& out = outs[i];
        // A row needs the 5 leading values plus at least one class score.
        if (out.cols <= 5) continue;

        for (int j = 0; j < out.rows; ++j) {
            // assumes the output holds 32-bit floats — the same assumption the
            // original pointer cast made
            const float* data = out.ptr<float>(j);
            Mat scores = out.row(j).colRange(5, out.cols);
            Point classIdPoint;
            double confidence = 0.0;
            minMaxLoc(scores, nullptr, &confidence, nullptr, &classIdPoint);
            if (confidence > confThreshold) {
                const int centerX = static_cast<int>(data[0] * frame.cols);
                const int centerY = static_cast<int>(data[1] * frame.rows);
                const int width = static_cast<int>(data[2] * frame.cols);
                const int height = static_cast<int>(data[3] * frame.rows);
                const int left = centerX - width / 2;
                const int top = centerY - height / 2;

                classIds.push_back(classIdPoint.x);
                confidences.push_back(static_cast<float>(confidence));
                boxes.push_back(Rect(left, top, width, height));
            }
        }
    }

    // Many rows fire for the same object; keep only the best box per cluster.
    vector<int> kept;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, kept);

    for (size_t k = 0; k < kept.size(); ++k) {
        const int idx = kept[k];
        const Rect& box = boxes[idx];
        const int classId = classIds[idx];

        // The label list comes from an external file; never index it unchecked.
        const bool hasLabel = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
        const string label = hasLabel ? classNames[classId] : "class " + to_string(classId);

        rectangle(frame, box.tl(), box.br(), Scalar(0, 255, 0), 3);
        putText(frame, label, Point(box.x, box.y - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
    }
}

int main() {
    string modelConfiguration = "yolov3.cfg";
    string modelWeights = "yolov3.weights";
    string classesFile = "coco.names";

    Net net = readNetFromDarknet(modelConfiguration, modelWeights);
    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.setPreferableTarget(DNN_TARGET_CPU);

    vector<string> classNames;
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classNames.push_back(line);

    VideoCapture cap(0);
    if (!cap.isOpened()) {
        cout << "Error opening video stream" << endl;
        return -1;
    }

    while (true) {
        Mat frame;
        cap >> frame;
        if (frame.empty()) break;

        detectObjects(frame, net, classNames);

        imshow("YOLO Object Detection", frame);
        if (waitKey(1) == 27) break; // Press 'ESC' to exit
    }

    cap.release();
    destroyAllWindows();
    return 0;
}

3. SSD (Single Shot MultiBox Detector):

SSD在不同尺度的特征图上进行检测，能够同时预测多个类别和边界框。SSD在速度和精度之间取得了良好的平衡。

#include <opencv2/opencv.hpp>
#include <opencv2/dnn.hpp>
#include <iostream>
#include <fstream>

using namespace cv;
using namespace dnn;
using namespace std;

/**
 * Runs the network on `frame` and draws a box and label for every detection
 * whose confidence exceeds 0.5.
 *
 * The network output is read as a 4-D blob whose third dimension enumerates
 * detections and whose fourth holds, per detection,
 * [<unused>, classId, confidence, left, top, right, bottom] with the box
 * corners relative to the frame size.
 *
 * @param frame       Image to run detection on; annotated in place. Empty frames are ignored.
 * @param net         Loaded network used for the forward pass.
 * @param classNames  Labels indexed by class id. An id without a label is
 *                    drawn as "class <id>" rather than read out of bounds.
 */
void detectObjects(Mat& frame, Net& net, const vector<string>& classNames) {
    if (frame.empty()) return;

    Mat blob;
    // The mean must be given per channel: a bare 127.5 converts to
    // Scalar(127.5, 0, 0, 0) and would only be subtracted from the first channel.
    // NOTE(review): scale 0.007843 / mean 127.5 must match how the loaded
    // model was trained — confirm against the model in use.
    blobFromImage(frame, blob, 0.007843, Size(300, 300), Scalar(127.5, 127.5, 127.5), false, false);
    net.setInput(blob);
    Mat detections = net.forward();

    // Anything other than a 4-D blob with at least 7 values per detection
    // cannot be decoded below.
    if (detections.dims != 4 || detections.size[3] < 7) return;

    // Mat::at has no four-index overload, so view the blob's last two
    // dimensions as a plain 2-D matrix (one detection per row, no copy).
    Mat detectionMat(detections.size[2], detections.size[3], CV_32F, detections.ptr<float>());

    const float confidenceThreshold = 0.5f;
    for (int i = 0; i < detectionMat.rows; ++i) {
        const float* det = detectionMat.ptr<float>(i);
        const float confidence = det[2];
        if (confidence > confidenceThreshold) {
            const int classId = static_cast<int>(det[1]);
            const int left = static_cast<int>(det[3] * frame.cols);
            const int top = static_cast<int>(det[4] * frame.rows);
            const int right = static_cast<int>(det[5] * frame.cols);
            const int bottom = static_cast<int>(det[6] * frame.rows);

            // The label list comes from an external file; never index it unchecked.
            const bool hasLabel = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
            const string label = hasLabel ? classNames[classId] : "class " + to_string(classId);

            rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0), 2);
            putText(frame, label, Point(left, top - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
        }
    }
}

int main() {
    string modelConfiguration = "deploy.prototxt";
    string modelWeights = "VGG_VOC0712_SSD_300x300_iter_120000.caffemodel";
    string classesFile = "coco.names";

    Net net = readNetFromCaffe(modelConfiguration, modelWeights);

    vector<string> classNames;
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classNames.push_back(line);

    VideoCapture cap(0);
    if (!cap.isOpened()) {
        cout << "Error opening video stream" << endl;
        return -1;
    }

    while (true) {
        Mat frame;
        cap >> frame;
        if (frame.empty()) break;

        detectObjects(frame, net, classNames);

        imshow("SSD Object Detection", frame);
        if (waitKey(1) == 27) break; // Press 'ESC' to exit
    }

    cap.release();
    destroyAllWindows();
    return 0;
}

4. RetinaNet:

RetinaNet引入了Focal Loss来缓解单阶段检测器训练时前景与背景样本极度不平衡的问题，并结合特征金字塔网络（FPN）在多个尺度上进行检测，因此对小目标也有较好的检测效果。

#include <torch/script.h> // One-stop header.
#include <opencv2/opencv.hpp>
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

using namespace cv;
using namespace std;

/**
 * Runs the TorchScript module on `frame` and draws a box and label for every
 * detection whose score exceeds 0.5.
 *
 * The frame is resized to 800x800, scaled to [0, 1], reordered from HWC to
 * NCHW and moved to CUDA before the forward pass. The module's output is read
 * as a tuple of (boxes, scores, labels).
 *
 * @param frame       3-channel image to run detection on; annotated in place.
 *                    Empty or non-3-channel frames are ignored.
 * @param module      Loaded TorchScript module; expected to already be on CUDA.
 * @param classNames  Labels indexed by class id. An id without a label is
 *                    drawn as "class <id>" rather than read out of bounds.
 */
void detectObjects(Mat& frame, torch::jit::script::Module& module, const vector<string>& classNames) {
    // The tensor below is declared with exactly 3 channels; any other layout
    // would make from_blob read past the image buffer.
    if (frame.empty() || frame.channels() != 3) return;

    // Inference only: skip autograd bookkeeping for the forward pass.
    at::NoGradGuard noGrad;

    // Preprocess the image
    Mat img;
    cv::resize(frame, img, Size(800, 800));
    img.convertTo(img, CV_32F, 1.0 / 255);
    // from_blob wraps img's buffer without copying; the data is copied when
    // the tensor is moved to the GPU below, while img is still alive.
    auto input_tensor = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kFloat32);
    input_tensor = input_tensor.permute({0, 3, 1, 2});
    input_tensor = input_tensor.to(torch::kCUDA);

    // Forward pass
    auto output = module.forward({input_tensor}).toTuple();
    if (output->elements().size() < 3) return;

    // Postprocess the output
    // NOTE(review): the (boxes, scores, labels) order depends on how the
    // model was exported — confirm against the exported module.
    auto detections = output->elements()[0].toTensor().to(torch::kCPU);
    auto scores = output->elements()[1].toTensor().to(torch::kCPU);
    auto labels = output->elements()[2].toTensor().to(torch::kCPU);

    const float confidenceThreshold = 0.5f;
    const int64_t detectionCount = detections.size(0);
    for (int64_t i = 0; i < detectionCount; ++i) {
        if (scores[i].item<float>() > confidenceThreshold) {
            const int classId = static_cast<int>(labels[i].item<int64_t>());
            auto box = detections[i];
            // assumes box coordinates are normalized to [0, 1] — TODO confirm
            // against the exported model
            const int left = static_cast<int>(box[0].item<float>() * frame.cols);
            const int top = static_cast<int>(box[1].item<float>() * frame.rows);
            const int right = static_cast<int>(box[2].item<float>() * frame.cols);
            const int bottom = static_cast<int>(box[3].item<float>() * frame.rows);

            // The label list comes from an external file; never index it unchecked.
            const bool hasLabel = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
            const string label = hasLabel ? classNames[classId] : "class " + to_string(classId);

            rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0), 2);
            putText(frame, label, Point(left, top - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
        }
    }
}

int main() {
    // Load the model
    string modelPath = "retinanet_model.pt";
    torch::jit::script::Module module;
    try {
        module = torch::jit::load(modelPath);
        module.to(torch::kCUDA);
    }
    catch (const c10::Error& e) {
        cerr << "Error loading the model\n";
        return -1;
    }

    // Load class names
    string classesFile = "coco.names";
    vector<string> classNames;
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classNames.push_back(line);

    // Open video capture
    VideoCapture cap(0);
    if (!cap.isOpened()) {
        cout << "Error opening video stream" << endl;
        return -1;
    }

    while (true) {
        Mat frame;
        cap >> frame;
        if (frame.empty()) break;

        detectObjects(frame, module, classNames);

        imshow("RetinaNet Object Detection", frame);
        if (waitKey(1) == 27) break; // Press 'ESC' to exit
    }

    cap.release();
    destroyAllWindows();
    return 0;
}

5. Mask R-CNN:

Mask R-CNN是Faster R-CNN的扩展，不仅可以进行目标检测，还可以进行实例分割。
这些算法通常使用深度学习框架（如TensorFlow、PyTorch）进行实现，并且需要大量标注数据进行训练。选择合适的目标检测算法取决于具体应用场景的需求，如实时性、精度和计算资源等。

#include <torch/script.h> // One-stop header.
#include <opencv2/opencv.hpp>
#include <iostream>
#include <memory>

using namespace cv;
using namespace std;

/**
 * Runs the TorchScript module on `frame` and, for every detection whose score
 * exceeds 0.5, paints the instance mask in red and draws the box and label.
 *
 * The frame is resized to 800x800, scaled to [0, 1], reordered from HWC to
 * NCHW and moved to CUDA before the forward pass. The module's output is read
 * as a tuple of (boxes, labels, scores, masks).
 *
 * @param frame       3-channel image to run detection on; annotated in place.
 *                    Empty or non-3-channel frames are ignored.
 * @param module      Loaded TorchScript module; expected to already be on CUDA.
 * @param classNames  Labels indexed by class id. An id without a label is
 *                    drawn as "class <id>" rather than read out of bounds.
 */
void detectObjects(Mat& frame, torch::jit::script::Module& module, const vector<string>& classNames) {
    // The tensor below is declared with exactly 3 channels; any other layout
    // would make from_blob read past the image buffer.
    if (frame.empty() || frame.channels() != 3) return;

    // Inference only: skip autograd bookkeeping for the forward pass.
    at::NoGradGuard noGrad;

    // Preprocess the image
    Mat img;
    cv::resize(frame, img, Size(800, 800));
    img.convertTo(img, CV_32F, 1.0 / 255);
    // from_blob wraps img's buffer without copying; the data is copied when
    // the tensor is moved to the GPU below, while img is still alive.
    auto input_tensor = torch::from_blob(img.data, {1, img.rows, img.cols, 3}, torch::kFloat32);
    input_tensor = input_tensor.permute({0, 3, 1, 2});
    input_tensor = input_tensor.to(torch::kCUDA);

    // Forward pass
    auto output = module.forward({input_tensor}).toTuple();
    if (output->elements().size() < 4) return;

    // Postprocess the output
    // NOTE(review): the (boxes, labels, scores, masks) order and the mask
    // layout depend on how the model was exported — confirm against the module.
    auto boxes = output->elements()[0].toTensor().to(torch::kCPU);
    auto labels = output->elements()[1].toTensor().to(torch::kCPU);
    auto scores = output->elements()[2].toTensor().to(torch::kCPU);
    auto masks = output->elements()[3].toTensor().to(torch::kCPU);

    const float confidenceThreshold = 0.5f;
    const Rect frameBounds(0, 0, frame.cols, frame.rows);
    const int64_t detectionCount = boxes.size(0);
    for (int64_t i = 0; i < detectionCount; ++i) {
        if (scores[i].item<float>() <= confidenceThreshold) continue;

        const int classId = static_cast<int>(labels[i].item<int64_t>());
        auto box = boxes[i];
        // assumes box coordinates are normalized to [0, 1] — TODO confirm
        // against the exported model
        const int left = static_cast<int>(box[0].item<float>() * frame.cols);
        const int top = static_cast<int>(box[1].item<float>() * frame.rows);
        const int right = static_cast<int>(box[2].item<float>() * frame.cols);
        const int bottom = static_cast<int>(box[3].item<float>() * frame.rows);
        const int boxWidth = right - left;
        const int boxHeight = bottom - top;

        // Extract and draw the mask. It is painted before the outline so the
        // fill does not cover the box border. A degenerate box, a box lying
        // entirely outside the frame, or a class id without a mask channel
        // would make resize / the ROI / the tensor index throw, so those cases
        // skip the mask and keep the box.
        const Rect visible = Rect(left, top, boxWidth, boxHeight) & frameBounds;
        const bool maskIndexable = masks.dim() == 4 && i < masks.size(0) &&
                                   classId >= 0 && classId < masks.size(1);
        if (boxWidth > 0 && boxHeight > 0 && visible.width > 0 && visible.height > 0 && maskIndexable) {
            // Own a contiguous float copy so the Mat header below can wrap it safely.
            auto mask = masks[i][classId].to(torch::kFloat32).contiguous();
            Mat maskMat(static_cast<int>(mask.size(0)), static_cast<int>(mask.size(1)), CV_32F, mask.data_ptr<float>());
            Mat boxMask;
            cv::resize(maskMat, boxMask, Size(boxWidth, boxHeight));
            // Keep only the part of the box-sized mask that falls inside the frame.
            Mat visibleMask = boxMask(Rect(visible.x - left, visible.y - top, visible.width, visible.height));
            Mat coloredRoi = frame(visible);
            coloredRoi.setTo(Scalar(0, 0, 255), visibleMask > 0.5);
        }

        // The label list comes from an external file; never index it unchecked.
        const bool hasLabel = classId >= 0 && static_cast<size_t>(classId) < classNames.size();
        const string label = hasLabel ? classNames[classId] : "class " + to_string(classId);

        rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0), 2);
        putText(frame, label, Point(left, top - 10), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0), 2);
    }
}

int main() {
    // Load the model
    string modelPath = "mask_rcnn_model.pt";
    torch::jit::script::Module module;
    try {
        module = torch::jit::load(modelPath);
        module.to(torch::kCUDA);
    }
    catch (const c10::Error& e) {
        cerr << "Error loading the model\n";
        return -1;
    }

    // Load class names
    string classesFile = "coco.names";
    vector<string> classNames;
    ifstream ifs(classesFile.c_str());
    string line;
    while (getline(ifs, line)) classNames.push_back(line);

    // Open video capture
    VideoCapture cap(0);
    if (!cap.isOpened()) {
        cout << "Error opening video stream" << endl;
        return -1;
    }

    while (true) {
        Mat frame;
        cap >> frame;
        if (frame.empty()) break;

        detectObjects(frame, module, classNames);

        imshow("Mask R-CNN Object Detection", frame);
        if (waitKey(1) == 27) break; // Press 'ESC' to exit
    }

    cap.release();
    destroyAllWindows();
    return 0;
}