这段代码最初由他人实现,我将其移植为 cc 下的 CPU 版本,可以在 Caffe 中使用 YOLOv3;但我感觉实际检测效果不如 darknet。
/// Logistic function 1 / (1 + e^-x).
///
/// The literals are doubles, so the sum and the division are evaluated in at
/// least double precision; the quotient is converted to Dtype on return.
template <typename Dtype>
inline Dtype sigmoid(Dtype x)
{
    const auto decay = exp(-x);
    const auto denominator = 1. + decay;
    return 1. / denominator;
}
/// One candidate detection produced while decoding the network output.
///
/// Every member has a zero default so that a freshly declared object never
/// carries indeterminate values when it is copied into a container before all
/// fields have been assigned.
template <typename Dtype>
class PredictionResult {
public:
    Dtype x = 0;           ///< Box centre, horizontal coordinate.
    Dtype y = 0;           ///< Box centre, vertical coordinate.
    Dtype w = 0;           ///< Box width.
    Dtype h = 0;           ///< Box height.
    Dtype objScore = 0;    ///< Objectness score.
    Dtype classScore = 0;  ///< Score of the class stored in classType.
    Dtype confidence = 0;  ///< Value used for thresholding and for sorting.
    int classType = 0;     ///< Zero-based class index.
};
/// Signed length of the overlap between two 1-D intervals given as
/// (centre, extent) pairs.
///
/// @param x1 centre of the first interval
/// @param w1 full extent of the first interval
/// @param x2 centre of the second interval
/// @param w2 full extent of the second interval
/// @return `min(right edges) - max(left edges)`; negative when the intervals
///         are disjoint.
///
/// All intermediates use Dtype, so a double instantiation is no longer
/// truncated to float precision.
template <typename Dtype>
Dtype overlap(Dtype x1, Dtype w1, Dtype x2, Dtype w2)
{
    const Dtype l1 = x1 - w1 / 2;
    const Dtype l2 = x2 - w2 / 2;
    const Dtype left = l1 > l2 ? l1 : l2;
    const Dtype r1 = x1 + w1 / 2;
    const Dtype r2 = x2 + w2 / 2;
    const Dtype right = r1 < r2 ? r1 : r2;
    return right - left;
}
/// Area of the intersection of two boxes.
///
/// @param a first box as {centre x, centre y, width, height}
/// @param b second box, same layout
/// @return the overlapping area, or 0 when the boxes do not overlap on either
///         axis.
///
/// Both vectors must hold at least four elements. They are taken by const
/// reference so that the call does not copy them, and the arithmetic stays in
/// Dtype instead of being narrowed to float.
template <typename Dtype>
Dtype box_intersection(const vector<Dtype>& a, const vector<Dtype>& b)
{
    const Dtype w = overlap(a[0], a[2], b[0], b[2]);
    const Dtype h = overlap(a[1], a[3], b[1], b[3]);
    if (w < 0 || h < 0) return 0;
    return w * h;
}
/// Area of the union of two boxes: area(a) + area(b) - intersection(a, b).
///
/// @param a first box as {centre x, centre y, width, height}
/// @param b second box, same layout
///
/// Both vectors must hold at least four elements. They are taken by const
/// reference so that the call does not copy them, and the arithmetic stays in
/// Dtype instead of being narrowed to float.
template <typename Dtype>
Dtype box_union(const vector<Dtype>& a, const vector<Dtype>& b)
{
    const Dtype i = box_intersection(a, b);
    return a[2] * a[3] + b[2] * b[3] - i;
}
/// Intersection-over-union of two boxes.
///
/// @param a first box as {centre x, centre y, width, height}
/// @param b second box, same layout
/// @return intersection area divided by union area; 0 when the union area is
///         not positive (e.g. two zero-sized boxes), so the result is never
///         NaN from a 0/0 division.
template <typename Dtype>
Dtype box_iou(const vector<Dtype>& a, const vector<Dtype>& b)
{
    const Dtype unionArea = box_union(a, b);
    // Written as a negated '>' so that a NaN union also takes the early exit.
    if (!(unionArea > 0)) return 0;
    return box_intersection(a, b) / unionArea;
}
/// Axis-aligned bounding box described by its corner coordinates.
///
/// Every field defaults to zero / false. The previous defaults (1 through 8,
/// with `difficult = 6` collapsing to true) look like protobuf field numbers
/// carried over from a message definition rather than intended values.
///
/// The set_* mutators mirror the accessor names that setNormalizedBBox() in
/// this file calls on a box.
struct NormalizedBBox {
    float xmin = 0.0f;
    float ymin = 0.0f;
    float xmax = 0.0f;
    float ymax = 0.0f;
    int label = 0;
    bool difficult = false;
    float score = 0.0f;
    float size = 0.0f;

    void set_xmin(float value) { xmin = value; }
    void set_ymin(float value) { ymin = value; }
    void set_xmax(float value) { xmax = value; }
    void set_ymax(float value) { ymax = value; }
    void set_label(int value) { label = value; }
    void set_difficult(bool value) { difficult = value; }
    void set_score(float value) { score = value; }
    void set_size(float value) { size = value; }
};
/// Ordering predicate for sorting predictions by descending confidence:
/// returns true when box1 is strictly more confident than box2.
bool BoxSortDecendScore(const PredictionResult<float>& box1, const PredictionResult<float>& box2) {
    const float first = box1.confidence;
    const float second = box2.confidence;
    return second < first;
}
/// Fills the corners and the area of `bbox` from a centre/size box.
///
/// @param bbox box to update; xmin, ymin, xmax, ymax and size are overwritten
/// @param x    centre, horizontal coordinate
/// @param y    centre, vertical coordinate
/// @param w    width
/// @param h    height
///
/// The lower corner is clamped to be >= 0 and the upper corner to be <= 1.
/// The fields are written directly because the NormalizedBBox struct in this
/// file exposes plain data members; the area is computed inline as
/// width * height, or 0 if the clamped corners ended up inverted.
template <typename Dtype>
void setNormalizedBBox(NormalizedBBox& bbox, Dtype x, Dtype y, Dtype w, Dtype h)
{
    Dtype xmin = x - w / 2.0;
    Dtype xmax = x + w / 2.0;
    Dtype ymin = y - h / 2.0;
    Dtype ymax = y + h / 2.0;
    if (xmin < 0.0) {
        xmin = 0.0;
    }
    if (xmax > 1.0) {
        xmax = 1.0;
    }
    if (ymin < 0.0) {
        ymin = 0.0;
    }
    if (ymax > 1.0) {
        ymax = 1.0;
    }
    bbox.xmin = static_cast<float>(xmin);
    bbox.ymin = static_cast<float>(ymin);
    bbox.xmax = static_cast<float>(xmax);
    bbox.ymax = static_cast<float>(ymax);

    const float width = bbox.xmax - bbox.xmin;
    const float height = bbox.ymax - bbox.ymin;
    bbox.size = (width < 0 || height < 0) ? 0.0f : width * height;
}
/// Greedy non-maximum suppression.
///
/// Walks `boxes` front to back; every box that has not been suppressed
/// suppresses each later box whose IoU with it is >= `threshold`. The indices
/// of the surviving boxes are appended to `idxes` in ascending order (`idxes`
/// is not cleared first).
///
/// @param boxes     candidates; earlier entries win over later ones, so the
///                  caller is expected to pass them sorted by descending
///                  confidence
/// @param idxes     output: indices into `boxes` of the boxes that are kept
/// @param threshold IoU at or above which a later box is dropped
///
/// An empty `boxes` is handled (the previous `boxes.size() - 1` bound
/// underflowed in that case).
/// NOTE(review): suppression ignores classType, i.e. boxes of different
/// classes suppress each other — confirm this is the intended behaviour.
template <typename Dtype>
void ApplyNms(vector<PredictionResult<Dtype>>& boxes, vector<int>& idxes, Dtype threshold) {
    const int count = static_cast<int>(boxes.size());
    vector<char> suppressed(count, 0);

    // Reused for every pair so that the inner loop does not allocate.
    vector<Dtype> keeper(4);
    vector<Dtype> candidate(4);

    for (int i = 0; i < count; ++i) {
        if (suppressed[i]) {
            continue;
        }
        keeper[0] = boxes[i].x;
        keeper[1] = boxes[i].y;
        keeper[2] = boxes[i].w;
        keeper[3] = boxes[i].h;
        for (int j = i + 1; j < count; ++j) {
            if (suppressed[j]) {
                continue;
            }
            candidate[0] = boxes[j].x;
            candidate[1] = boxes[j].y;
            candidate[2] = boxes[j].w;
            candidate[3] = boxes[j].h;
            if (box_iou(keeper, candidate) >= threshold) {
                suppressed[j] = 1;
            }
        }
    }

    for (int i = 0; i < count; ++i) {
        if (!suppressed[i]) {
            idxes.push_back(i);
        }
    }
}
/// Applies an in-place softmax to `input[0 .. classes)` and records the
/// winning class in `predict`.
///
/// @param input   class scores; overwritten with the softmax probabilities
/// @param classes number of entries in `input`
/// @param predict receives classType (index of the largest probability, the
///                first one on ties) and classScore (that probability)
template <typename Dtype>
void class_index_and_score(Dtype* input, int classes, PredictionResult<Dtype>& predict)
{
    // The largest score is subtracted before exponentiating.
    Dtype peak = input[0];
    for (int k = 1; k < classes; ++k) {
        if (input[k] > peak)
            peak = input[k];
    }

    Dtype total = 0;
    for (int k = 0; k < classes; ++k) {
        input[k] = exp(input[k] - peak);
        total += input[k];
    }

    for (int k = 0; k < classes; ++k) {
        input[k] /= total;
    }

    int best = 0;
    for (int k = 1; k < classes; ++k) {
        if (input[k] > input[best]) {
            best = k;
        }
    }
    predict.classType = best;
    predict.classScore = input[best];
}
/// Decodes one box from strided network output into {x, y, w, h}.
///
/// @param b      output; cleared and then filled with exactly four values
/// @param x      buffer holding the four box channels
/// @param biases anchor sizes stored as consecutive (width, height) pairs
/// @param n      anchor index; reads biases[2 * n] and biases[2 * n + 1]
/// @param index  offset of the first box channel inside `x`
/// @param i      grid column of the cell
/// @param j      grid row of the cell
/// @param lw     divisor applied to the x centre
/// @param lh     divisor applied to the y centre
/// @param w      divisor applied to the decoded width
/// @param h      divisor applied to the decoded height
/// @param stride distance in `x` between consecutive channels
///
/// Centre: (cell index + channel value) / grid size. Size: exp(channel
/// value) * anchor size / divisor. `biases` is taken by const reference
/// because this runs once per cell and anchor; copying the vector each time
/// is wasted work.
template <typename Dtype>
void get_region_box2(vector<Dtype> &b, Dtype* x, const vector<Dtype>& biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride) {
    b.clear();
    b.reserve(4);
    b.push_back((i + (x[index + 0 * stride])) / lw);
    b.push_back((j + (x[index + 1 * stride])) / lh);
    b.push_back(exp(x[index + 2 * stride]) * biases[2 * n] / (w));
    b.push_back(exp(x[index + 3 * stride]) * biases[2 * n + 1] / (h));
}
class Yolov3DetectionOutput : public AbstractCustomLayer {
public:
SETUP_LAYERFUNC(Yolov3DetectionOutput);
virtual void setup(const char* name, const char* type, const char* param_str, int phase, Blob** bottom, int numBottom, Blob** top, int numTop) {
//
//CHECK(yolov3_detection_output_param.has_num_classes()) << "Must specify num_classes";
side_ = bottom[0]->width();
num_class_ = 20;
num_ = 2;
coords_ = 4;
confidence_threshold_ = .01;
nms_threshold_ = .45;
mask_group_num_ = 3;
biases_ = { 10,13,16,30,33,23,30,61,62,45,59,119,116,90,156,198,373,326 };
/*for (int c = 0; c < 18; ++c) {
biases_.push_back(biases[c]);
}*/
mask_ = { 6,7,8,3,4,5,0,1,2 };
/*for (int c = 0; c < yolov3_detection_output_param.mask_size(); ++c) {
mask_.push_back(yolov3_detection_output_param.mask(c));
}*/
anchors_scale_ = { 32,16,16 };
/*for (int c = 0; c < yolov3_detection_output_param.anchors_scale_size(); ++c) {
anchors_scale_.push_back(yolov3_detection_output_param.anchors_scale(c));
}*/
groups_num_ = 9 / mask_group_num_;
}
virtual void forward(Blob** bottom, int numBottom, Blob** top, int numTop) {
const int num = bottom[0]->num();
int len = 4 + num_class_ + 1;
int stride = side_ * side_;
int mask_offset = 0;
vector<PredictionResult<float>> predicts;
predicts.clear();
int *class_score = new int[num_class_];
for (int t = 0; t < 3; t++) {
side_ = bottom[t]->width();
int stride = side_ * side_;
swap_ = newBlobByShape(bottom[t]->num(), bottom[t]->channel(), bottom[t]->height(), bottom[t]->width());
swap_->ReshapeLike(*bottom[t]);
float* swap_data = swap_->mutable_cpu_data();
const float* input_data = bottom[t]->cpu_data();
for (int b = 0; b < bottom[t]->num(); b++) {
for (int s = 0; s < side_*side_; s++) {
//LOG(INFO) << s;
for (int n = 0; n < num_; n++) {
//LOG(INFO) << bottom[t]->count(1);
int index = n * len*stride + s + b * bottom[t]->count(1);
vector<float> pred;
for (int c = 0; c < len; ++c) {
int index2 = c * stride + index;
//LOG(INFO)<<index2;
if (c == 2 || c == 3) {
swap_data[index2] = (input_data[index2 + 0]);
}
else {
if (c > 4) {
//LOG(INFO) << c - 5;
class_score[c - 5] = sigmoid(input_data[index2 + 0]);
}
else {
swap_data[index2] = sigmoid(input_data[index2 + 0]);
}
}
}
int y2 = s / side_;
int x2 = s % side_;
//LOG(INFO) << x2 << "," << y2;
float obj_score = swap_data[index + 4 * stride];
//LOG(INFO) << obj_score;
get_region_box2(pred, swap_data, biases_, mask_[n + mask_offset], index, x2, y2, side_, side_, side_*anchors_scale_[t], side_*anchors_scale_[t], stride);
//LOG(INFO)<<anchors_scale_[t];
//LOG(INFO) << pred[0] << "," << pred[1];
//float maxmima_score = 0;
PredictionResult<float> predict;
for (int c = 0; c < num_class_; ++c) {
class_score[c] *= obj_score;
//LOG(INFO) << class_score[c];
if (class_score[c] > confidence_threshold_)
{
//if(class_score[c]>maxmima_score)
{
//maxmima_score = class_score[c];
predict.x = pred[0];
predict.y = pred[1];
predict.w = pred[2];
predict.h = pred[3];
predict.classType = c;
predict.confidence = class_score[c];
predicts.push_back(predict);
}
//LOG(INFO) << predict.x << "," << predict.y << "," << predict.w << "," << predict.h;
//LOG(INFO) << predict.confidence;
}
}
//if(maxmima_score> confidence_threshold_)
//{
// predicts.push_back(predict);
//}
}
}
}
mask_offset += groups_num_;
}
delete[] class_score;
sort(predicts.begin(), predicts.end(), BoxSortDecendScore);
vector<int> idxes;
int num_kept = 0;
if (predicts.size() > 0) {
//LOG(INFO) << predicts.size();
ApplyNms(predicts, idxes, nms_threshold_);
num_kept = idxes.size();
//LOG(INFO) << num_kept;
}
vector<int> top_shape(2, 1);
top_shape.push_back(num_kept);
top_shape.push_back(7);
float* top_data;
if (num_kept == 0) {
//DLOG(INFO) << "Couldn't find any detections";
top_shape[2] = swap_->num();
top[0]->Reshape(top_shape[0], top_shape[1], top_shape[2], top_shape[3]);//不知道是否可行
top_data = top[0]->mutable_cpu_data();
caffe_set(top[0]->count(), float(-1), top_data);
// Generate fake results per image.
for (int i = 0; i < num; ++i) {
top_data[0] = i;
top_data += 7;
}
}
else {
top[0]->Reshape(top_shape[0], top_shape[1], top_shape[2], top_shape[3]);//不知道是否可行
top_data = top[0]->mutable_cpu_data();
for (int i = 0; i < num_kept; i++) {
top_data[i * 7] = 0; //Image_Id
top_data[i * 7 + 1] = predicts[idxes[i]].classType + 1; //label
top_data[i * 7 + 2] = predicts[idxes[i]].confidence; //confidence
float left = (predicts[idxes[i]].x - predicts[idxes[i]].w / 2.);
float right = (predicts[idxes[i]].x + predicts[idxes[i]].w / 2.);
float top = (predicts[idxes[i]].y - predicts[idxes[i]].h / 2.);
float bot = (predicts[idxes[i]].y + predicts[idxes[i]].h / 2.);
top_data[i * 7 + 3] = left;
top_data[i * 7 + 4] = top;
top_data[i * 7 + 5] = right;
top_data[i * 7 + 6] = bot;
std::cout << "Detection box" << "," << predicts[idxes[i]].classType << "," << predicts[idxes[i]].x << "," << predicts[idxes[i]].y << "," << predicts[idxes[i]].w << "," << predicts[idxes[i]].h;
}
}
}
virtual void backward(Blob** bottom, int numBottom, Blob** top, int numTop, const bool* propagate_down) {
};
virtual void reshape(Blob** bottom, int numBottom, Blob** top, int numTop) {
//CHECK_EQ(bottom[0]->num(), 1);
// num() and channels() are 1.
//vector<int> top_shape(2, 1);
// Since the number of bboxes to be kept is unknown before nms, we manually
// set it to (fake) 1.
//top_shape.push_back(1);
// Each row is a 7 dimension vector, which stores
// [image_id, label, confidence, x, y, w, h]
//top_shape.push_back(7);
top[0]->Reshape(2, 1, 1, 7);
};
private:
int side_;
int num_class_;
int num_;
int coords_;
int mask_group_num_;
int groups_num_;
float confidence_threshold_;
float nms_threshold_;
vector<float> biases_;
vector<float> anchors_scale_;
vector<float> mask_;
Blob *swap_;
};```