// This is a demo code for using a SSD model to do detection.
// The code is modified from examples/cpp_classification/classification.cpp.
// Usage:
//    ssd_detect [FLAGS] model_file weights_file list_file
//
// where model_file is the .prototxt file defining the network architecture, and
// weights_file is the .caffemodel file containing the network parameters, and
// list_file contains a list of image files with the format as follows:
//    folder/img1.JPEG
//    folder/img2.JPEG
// list_file can also contain a list of video files with the format as follows:
//    folder/video1.mp4
//    folder/video2.mp4
//
// NOTE(review): the header names below were missing from the source text and
// are restored from the upstream Caffe SSD example - confirm against the build.
#define USE_OPENCV 1
#include <caffe/caffe.hpp>
#ifdef USE_OPENCV
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif  // USE_OPENCV
#include <algorithm>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iosfwd>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#ifdef USE_OPENCV
using namespace caffe;  // NOLINT(build/namespaces)
using namespacecv;using namespacestd;classDetector {public:
Detector(const string&model_file,const string&weights_file,const string&mean_file,const string&mean_value);
std::vector > Detect(const cv::Mat&img);private:void SetMean(const string& mean_file, const string&mean_value);void WrapInputLayer(std::vector<:mat>*input_channels);void Preprocess(const cv::Mat&img,
std::vector<:mat>*input_channels);private:
boost::shared_ptr >net_;
cv::Size input_geometry_;intnum_channels_;
cv::Mat mean_;
};
/**
 * Loads the network definition and trained weights, validates that the
 * network has exactly one input and one output blob, records the input
 * geometry, and prepares the mean image.
 *
 * Any failed CHECK aborts the process (glog semantics).
 */
Detector::Detector(const string& model_file,
                   const string& weights_file,
                   const string& mean_file,
                   const string& mean_value) {
#ifdef CPU_ONLY
  Caffe::set_mode(Caffe::CPU);
#else
  Caffe::set_mode(Caffe::GPU);
#endif

  /* Load the network. */
  net_.reset(new Net<float>(model_file, TEST));
  net_->CopyTrainedLayersFrom(weights_file);

  CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
  CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output.";

  Blob<float>* input_layer = net_->input_blobs()[0];
  num_channels_ = input_layer->channels();
  CHECK(num_channels_ == 3 || num_channels_ == 1)
      << "Input layer should have 1 or 3 channels.";
  input_geometry_ = cv::Size(input_layer->width(), input_layer->height());

  /* Build the mean image from the mean file or the mean value list.
   * Must run after num_channels_ and input_geometry_ are set, because
   * SetMean reads both. */
  SetMean(mean_file, mean_value);
}
std::vector > Detector::Detect(const cv::Mat&img) {
Blob* input_layer = net_->input_blobs()[0];
input_layer->Reshape(1, num_channels_,
input_geometry_.height, input_geometry_.width);/*Forward dimension change to all layers.*/net_->Reshape();
std::vector<:mat>input_channels;
WrapInputLayer(&input_channels);
Preprocess(img,&input_channels);
net_->Forward();/*Copy the output layer to a std::vector*/Blob* result_blob = net_->output_blobs()[0];const float* result = result_blob->cpu_data();const int num_det = result_blob->height();
vector >detections;for (int k = 0; k < num_det; ++k) {if (result[0] == -1) {//Skip invalid detection.
result += 7;continue;
}
vector detection(result, result + 7);
detections.push_back(detection);
result+= 7;
}returndetections;
}/*Load the mean file in binaryproto format.*/
void Detector::SetMean(const string& mean_file, const string&mean_value) {
cv::Scalar channel_mean;if (!mean_file.empty()) {
CHECK(mean_value.empty())<<
"Cannot specify mean_file and mean_value at the same time";
BlobProto blob_proto;
ReadProtoFromBinaryFileOrDie(mean_file.c_str(),&blob_proto);/*Convert from BlobProto to Blob*/Blobmean_blob;
mean_blob.FromProto(blob_proto);
CHECK_EQ(mean_blob.channels(), num_channels_)<< "Number of channels of mean file doesn't match input layer.";/*The format of the mean file is planar 32-bit float BGR or grayscale.*/std::vector<:mat>channels;float* data =mean_blob.mutable_cpu_data();for (int i = 0; i < num_channels_; ++i) {/*Extract an individual channel.*/cv::Mat channel(mean_blob.height(), mean_blob.width(), CV_32FC1, data);
channels.push_back(channel);
data+= mean_blob.height() *mean_blob.width();
}/*Merge the separate channels into a single image.*/cv::Mat mean;
cv::merge(channels, mean);/*Compute the global mean pixel value and create a mean image
* filled with this value.*/channel_mean=cv::mean(mean);
mean_=cv::Mat(input_geometry_, mean.type(), channel_mean);
}if (!mean_value.empty()) {
CHECK(mean_file.empty())<<
"Cannot specify mean_file and mean_value at the same time";
stringstream ss(mean_value);
vectorvalues;stringitem;while (getline(ss, item, ',')) {float value =std::atof(item.c_str());
values.push_back(value);
}
CHECK(values.size()== 1 || values.size() == num_channels_) <<
"Specify either 1 mean_value or as many as channels:" <
std::vector<:mat>channels;for (int i = 0; i < num_channels_; ++i) {/*Extract an individual channel.*/cv::Mat channel(input_geometry_.height, input_geometry_.width, CV_32FC1,
cv::Scalar(values[i]));
channels.push_back(channel);
}
cv::merge(channels, mean_);
}
}/*Wrap the input layer of the network in separate cv::Mat objects
* (one per channel). This way we save one memcpy operation and we
* don't need to rely on cudaMemcpy2D. The last preprocessing
* operation will write the separate channels directly to the input
* layer.*/
void Detector::WrapInputLayer(std::vector<:mat>*input_channels) {
Blob* input_layer = net_->input_blobs()[0];int width = input_layer->width();int height = input_layer->height();float* input_data = input_layer->mutable_cpu_data();for (int i = 0; i < input_layer->channels(); ++i) {
cv::Mat channel(height, width, CV_32FC1, input_data);
input_channels->push_back(channel);
input_data+= width *height;
}
}void Detector::Preprocess(const cv::Mat&img,
std::vector<:mat>*input_channels) {/*Convert the input image to the input image format of the network.*/cv::Mat sample;if (img.channels() == 3 && num_channels_ == 1)
cv::cvtColor(img, sample, cv::COLOR_BGR2GRAY);else if (img.channels() == 4 && num_channels_ == 1)
cv::cvtColor(img, sample, cv::COLOR_BGRA2GRAY);else if (img.channels() == 4 && num_channels_ == 3)
cv::cvtColor(img, sample, cv::COLOR_BGRA2BGR);else if (img.channels() == 1 && num_channels_ == 3)
cv::cvtColor(img, sample, cv::COLOR_GRAY2BGR);elsesample=img;
cv::Mat sample_resized;if (sample.size() !=input_geometry_)
cv::resize(sample, sample_resized, input_geometry_);elsesample_resized=sample;
cv::Mat sample_float;if (num_channels_ == 3)
sample_resized.convertTo(sample_float, CV_32FC3);elsesample_resized.convertTo(sample_float, CV_32FC1);
cv::Mat sample_normalized;
cv::subtract(sample_float, mean_, sample_normalized);/*This operation will write the separate BGR planes directly to the
* input layer of the network because it is wrapped by the cv::Mat
* objects in input_channels.*/cv::split(sample_normalized,*input_channels);
CHECK(reinterpret_cast(input_channels->at(0).data)== net_->input_blobs()[0]->cpu_data())<< "Input channels are not wrapping the input layer of the network.";
}
// Command-line flag definitions (gflags).
// NOTE(review): main() in this file reads only FLAGS_mean_file; the other
// values are shadowed there by hard-coded locals, and no flag-parsing call is
// visible in this file - confirm whether that is intended.
DEFINE_string(mean_file, "",
    "The mean file used to subtract from the input image.");
// The help text is built from adjacent string literals, so each fragment
// carries its own separating space.
DEFINE_string(mean_value, "104,117,123",
    "If specified, can be one value or can be same as image channels"
    " - would subtract from the corresponding channel. Separated by ','. "
    "Either mean_file or mean_value should be provided, not both.");
DEFINE_string(file_type, "image",
    "The file type in the list_file. Currently support image and video.");
DEFINE_string(out_file, "",
    "If provided, store the detection results in the out_file.");
DEFINE_double(confidence_threshold, 0.6,
    "Only store detections with score higher than the threshold.");
vector labels = {"background","aeroplane", "bicycle","bird", "boat", "bottle","bus", "car", "cat","chair","cow","diningtable","dog","horse","motorbike","person","pottedplant","sheep","sofa","train","tvmonitor"};int main(int argc, char**argv) {const string& model_file = "deploy.prototxt";const string& weights_file = "/home/jiawenhao/ssd/caffe/models/VGGNet/VOC0712/SSD_300x300/VGG_VOC0712_SSD_300x300_iter_120000.caffemodel";const string& mean_file =FLAGS_mean_file;const string& mean_value = "104, 117, 123";const string& file_type = "image";const string& out_file = "a.outfile";const float confidence_threshold = 0.6;//Initialize the network.
Detector detector(model_file, weights_file, mean_file, mean_value);//Set the output mode.
std::streambuf* buf =std::cout.rdbuf();
std::ofstream outfile;if (!out_file.empty()) {
outfile.open(out_file.c_str());if(outfile.good()) {
buf=outfile.rdbuf();
}
}
std::ostreamout(buf);//Process image one by one.
std::ifstream infile("testimg.list");
std::stringfile;
std::stringimgName;int cnt = 0;while (infile >>file)
{if (file_type == "image")
{
std::cout<< file <
imgName= file.substr(pos + 1, file.size() -pos);
cv::Mat img= cv::imread(file, -1);
CHECK(!img.empty()) << "Unable to decode image" <
std::vector > detections =detector.Detect(img);/*Print the detection results.*/
for (int i = 0; i < detections.size(); ++i) {const vector& d =detections[i];//Detection format: [image_id, label, score, xmin, ymin, xmax, ymax].
CHECK_EQ(d.size(), 7);const float score = d[2];if (score >=confidence_threshold) {out << file << " ";out << static_cast(d[1]) << " ";out << score << " ";out << static_cast(d[3] * img.cols) << " ";out << static_cast(d[4] * img.rows) << " ";out << static_cast(d[5] * img.cols) << " ";out << static_cast(d[6] * img.rows) <<:endl x="static_cast<int">(d[3] *img.cols);int y = static_cast(d[4] *img.rows);int width = static_cast(d[5] * img.cols) -x;int height = static_cast(d[6] * img.rows) -y;
Rect rect(max(x,0), max(y,0), width, height);
rectangle(img, rect, Scalar(0,255,0));string sco = to_string(score).substr(0, 5);
putText(img, labels[static_cast(d[1])] + ":" + sco, Point(max(x, 0), max(y + height / 2, 0)),
FONT_HERSHEY_SIMPLEX,1, Scalar(0,255,0));
imwrite("result/" +imgName, img);
}
}
}else if (file_type == "video") {
cv::VideoCapture cap(file);if (!cap.isOpened()) {
LOG(FATAL)<< "Failed to open video:" <
}
cv::Mat img;int frame_count = 0;while (true) {bool success =cap.read(img);if (!success) {
LOG(INFO)<< "Process" << frame_count << "frames from" <
}
CHECK(!img.empty()) << "Error when read frame";
std::vector > detections =detector.Detect(img);/*Print the detection results.*/
for (int i = 0; i < detections.size(); ++i) {const vector& d =detections[i];//Detection format: [image_id, label, score, xmin, ymin, xmax, ymax].
CHECK_EQ(d.size(), 7);const float score = d[2];if (score >=confidence_threshold) {out << file << "_";out << std::setfill('0') << std::setw(6) << frame_count << " ";out << static_cast(d[1]) << " ";out << score << " ";out << static_cast(d[3] * img.cols) << " ";out << static_cast(d[4] * img.rows) << " ";out << static_cast(d[5] * img.cols) << " ";out << static_cast(d[6] * img.rows) <<:endl>
}
}++frame_count;
}if(cap.isOpened()) {
cap.release();
}
}else{
LOG(FATAL)<< "Unknown file_type:" <
}
}return 0;
}#else
int main(int argc, char**argv) {
LOG(FATAL)<< "This example requires OpenCV; compile with USE_OPENCV.";
}#endif //USE_OPENCV