开发环境
VS2013 + OpenCV3.4.1 + Qt5.8.0
实验准备
VGG_VOC0712_SSD_300x300_iter_120000.caffemodel
deploy.prototxt
以上数据可以由这里下载得到(或者直接下载本工程),使用的是基于Caffe+VOC0712数据集训练出来的caffemodel。
主要代码
读取网络
string modelTxt = "caffe_ssd_300x300/deploy.prototxt";
string modelBin = "caffe_ssd_300x300/VGG_VOC0712_SSD_300x300_iter_120000.caffemodel";
try{
net = dnn::readNetFromCaffe(modelTxt, modelBin);
}
catch (cv::Exception &ee){
QMessageBox::warning(this, "Exception", ee.what());
if (net.empty()){
QMessageBox::warning(this, "Exception", "Can't load the network by using the flowing files.");
return;
}
}
前向识别
Mat frame;
image.copyTo(frame);
if (frame.empty()){
QMessageBox::warning(this, "Warning", "image is empty, please check!");
return;
}
if (frame.channels() == 4) cvtColor(frame, frame, COLOR_BGRA2BGR);
double ttt = (double)cvGetTickCount();
Mat inputBlob = blobFromImage(frame, 1.0f, Size(300, 300), Scalar(104, 117, 123), false, false); //Convert Mat to batch of images
net.setInput(inputBlob, "data"); //set the network input
Mat detection = net.forward("detection_out"); //compute output
ostringstream ss;
vector<double> layersTimings;
Mat detectionMat(detection.size[2], detection.size[3], CV_32F, detection.ptr<float>());
ttt = (double)cvGetTickCount() - ttt;
ui.labelTime->setText(toChinese("识别时间:") + QString::number(ttt / (cvGetTickFrequency() * 1000000)) + toChinese("秒"));
float confidenceThreshold = ui.dsbConfidence->value();
for (int i = 0; i < detectionMat.rows; i++){
float confidence = detectionMat.at<float>(i, 2);
if (confidence > confidenceThreshold){
size_t objectClass = (size_t)(detectionMat.at<float>(i, 1));
int xLeftBottom = static_cast<int>(detectionMat.at<float>(i, 3) * frame.cols);
int yLeftBottom = static_cast<int>(detectionMat.at<float>(i, 4) * frame.rows);
int xRightTop = static_cast<int>(detectionMat.at<float>(i, 5) * frame.cols);
int yRightTop = static_cast<int>(detectionMat.at<float>(i, 6) * frame.rows);
ss.str("");
ss << confidence;
String conf(ss.str());
Rect object(xLeftBottom, yLeftBottom, xRightTop - xLeftBottom, yRightTop - yLeftBottom);
rectangle(frame, object, Scalar(0, 0, 255));
//String label = String(classNames[objectClass]) + ": " + conf;
String label = String(classNamesZH[objectClass]) + ": " + conf;
int baseLine = 0;
Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.48, 1, &baseLine);
rectangle(frame, Rect(Point(xLeftBottom, yLeftBottom + labelSize.height / 2 - 4),
Size(labelSize.width, labelSize.height + baseLine)),
Scalar(255, 255, 255), FILLED);
//putText(frame, label, Point(xLeftBottom, yLeftBottom), FONT_HERSHEY_SIMPLEX, 0.48, Scalar(0, 0, 0));
putTextZH(frame, label.c_str(), Point(xLeftBottom, yLeftBottom), Scalar(0, 0, 255), 14, "Arial");
}
}
Mat detectionMat 是输入图像经过网络前向传播后输出的结果矩阵(每行对应一个检测目标,共 7 列;本例中为 10 行),其定义如下图所示:
目标共 20 个类别(另加 1 个背景类,数组共 21 项),定义如下:
// English label names for the detector's classes; the array index is the
// class id (index 0 is the background class).
const char* classNames[] = {
    "background",   //  0
    "aeroplane",    //  1
    "bicycle",      //  2
    "bird",         //  3
    "boat",         //  4
    "bottle",       //  5
    "bus",          //  6
    "car",          //  7
    "cat",          //  8
    "chair",        //  9
    "cow",          // 10
    "diningtable",  // 11
    "dog",          // 12
    "horse",        // 13
    "motorbike",    // 14
    "person",       // 15
    "pottedplant",  // 16
    "sheep",        // 17
    "sofa",         // 18
    "train",        // 19
    "tvmonitor"     // 20
};
// Chinese label names for the detector's classes, in the same order as the
// English table; the array index is the class id (index 0 is background).
const char* classNamesZH[] = {
    "背景",    //  0  background
    "飞机",    //  1  aeroplane
    "自行车",  //  2  bicycle
    "鸟",      //  3  bird
    "船",      //  4  boat
    "瓶子",    //  5  bottle
    "巴士",    //  6  bus
    "汽车",    //  7  car
    "猫",      //  8  cat
    "椅子",    //  9  chair
    "牛",      // 10  cow
    "餐桌",    // 11  diningtable
    "狗",      // 12  dog
    "马",      // 13  horse
    "摩托车",  // 14  motorbike
    "人",      // 15  person
    "盆栽",    // 16  pottedplant
    "羊",      // 17  sheep
    "沙发",    // 18  sofa
    "火车",    // 19  train
    "电视"     // 20  tvmonitor
};
上图中置信概率最高(0.999)的那一行,其类别下标为2,对应的是bicycle(自行车)。只要某一行的置信概率大于设置的阈值(变量confidenceThreshold),就会在图像上标记出该目标的位置(detectionMat行向量中下标为3、4、5、6的元素,下标从0开始)。比如将阈值confidenceThreshold设置为0.5,而识别结果中只有一个大于0.5,因此只会在图像上标记出自行车,如下图:
实验效果
本地图片
相机实时
在相机上(普通的USB相机)会比较卡顿···,因为用的是CPU跑的,附上本人帅照···
附件
源代码工程戳这里(注:release下的可执行程序可以直接运行)。