Face Alignment by Explicit Shape Regression（ESR）源码解析

本文链接：https://blog.csdn.net/leo_812/article/details/52399742

最近在研究人脸对齐，Joint Cascade Face Detection and Alignment（ECCV14）这篇文章感觉不错，将对齐和人脸检测同时做了，而且速度非常快，精度也很高。不过菜鸟一下子看不懂，所以就翻了一下之前的文章，发现这些算法都是一点一点进化过来的。之前作者发表过Face Alignment at 3000 FPS via Regressing Local Binary Features,再之前CVPR2012的时候他们在Face Alignment by Explicit Shape Regression中提出了shape index feature，这个特征在3000fps中也有使用。
先简单的介绍一下论文，之后会给出注释过的源码，只注释了训练部分，因为这个代码有一段时间了，看一下思路学习一下就好。新的方法实在太多，而且性能更优越，这里只是为后续工作打基础。

论文简介

Face Alignment by Shape Regression

作者使用了多级回归的方式得到特征点，在3000fps中也用到了多级回归。一共训练了10个强分类器，每个强分类器输出一个shape的更新参数，shape更新之后再重新生成新的特征，训练并进行下一次迭代。这10个强分类器每一个都是由500个蕨分类器组成，有点像随机森林。

Shape-indexed (image) features

在介绍蕨分类器之前要先介绍一下Shape-indexed features，蕨分类器和回归树不同的地方在于，回归树是根据最小均方差来产生特征。而蕨分类器是自己生成特征。这里用到的特征就是Shape-indexed features ，论文中提到了计算方法。具体解释会在源码中给出说明。
1. Project the regression target(vectorial delta shape) to a random direction to produce a scalar.
2. Among P^2 features, select a feature with highest correlation to the scalar.
3. Repeat steps 1. and 2. F times to obtain F features.
4. Construct a fern by F features with random thresholds.

Random Ferns

随机蕨算法比较早的应用是在TLD算法中，然后被使用在ESR这篇论文中。下面我找了几篇比较好的随机蕨参考文献，感觉第一篇说的最简洁清晰，不过他说到的是分类问题，对于ESR中的回归问题其实也一样，统计落入一个分桶中的所有差值做平均，就可以得到回归值。linux下没有截图工具，之后再加一些图说明一下。
单个蕨分类器的准确率较低，所以通过类似随机森林的方法多次取特征，多次采样，对结果做voting或者average就可以得到较好的结果。

参考文献：
http://blog.csdn.net/huangynn/article/details/51730076
http://blog.sciencenet.cn/blog-465130-964430.html
http://blog.csdn.net/stayfoolish_fan/article/details/50506906
http://blog.csdn.net/stayfoolish_fan/article/details/50455359

工程以及训练样本我之后会放在github上，对原论文做了些修改去掉了论文里提到的similarity transform，可能精度会有一定下降，不过肉眼看不出，代码会清晰一些。

TrainDemo.cpp

函数入口，设置了一些基本的参数，加载数据，然后开始训练。

#include "FaceAlignment.h"
using namespace std;
using namespace cv;

int main(){

    char s[10];
    sprintf(s,"%.4d",1);
    string ss(s);
    cout<<ss<<endl;


    int img_num = 130;     //1340
    int candidate_pixel_num = 400;
    int fern_pixel_num = 5;
    int first_level_num = 10;
    int second_level_num = 500; 
    int landmark_num = 29;
    int initial_number = 20;
    vector<Mat_<uchar> > images;
    vector<BoundingBox> bbox;

    //加载训练图片
    cout<<"Read images..."<<endl;
    for(int i = 0;i < img_num;i++){
        string image_name = "/home/f/FaceAlignment-master/FaceAlignment/COFW_Dataset/trainingImages/";
        image_name = image_name + to_string(i+1) + ".jpg";
        Mat_<uchar> temp = imread(image_name,0);
        images.push_back(temp);
    }

    // 读取bounding_box.  x,y,width,height,center_x,center_y
    vector<BoundingBox> bounding_box;
    ifstream fin;
    fin.open("/home/f/FaceAlignment-master/FaceAlignment/COFW_Dataset/boundingbox.txt");
    for(int i = 0;i < img_num;i++){
        BoundingBox temp;
        fin>>temp.start_x>>temp.start_y>>temp.width>>temp.height;
        temp.centroid_x = temp.start_x + temp.width/2.0;
        temp.centroid_y = temp.start_y + temp.height/2.0;
        bounding_box.push_back(temp);
    }
    fin.close();

    // 读取特征点坐标
    vector<Mat_<double> > ground_truth_shapes;
    fin.open("/home/f/FaceAlignment-master/FaceAlignment/COFW_Dataset/keypoints.txt");
    for(int i = 0;i < img_num;i++){
        Mat_<double> temp(landmark_num,2);
        for(int j = 0;j < landmark_num;j++){
            fin>>temp(j,0); 
        }
        for(int j = 0;j < landmark_num;j++){
            fin>>temp(j,1); 
        }
        ground_truth_shapes.push_back(temp);
    }
    fin.close(); 
    //训练模型
    ShapeRegressor regressor;
    regressor.Train(images,ground_truth_shapes,bounding_box,first_level_num,second_level_num,
                    candidate_pixel_num,fern_pixel_num,initial_number);
    regressor.Save("/home/f/FaceAlignment-master/FaceAlignment/model.txt");
    return 0;
}

ShapeRegressor.cpp

这里也没有到真正的训练阶段，主要是预处理。

void ShapeRegressor::Train(const vector<Mat_<uchar> >& images,          //gray scale images
                   const vector<Mat_<double> >& ground_truth_shapes,    // a vector of N*2 matrix, where N is the number of landmarks
                   const vector<BoundingBox>& bounding_box,             // BoundingBox of faces
                   int first_level_num, int second_level_num,           // 10  500
                   int candidate_pixel_num, int fern_pixel_num,         // 400 5
                   int initial_num){                                    // 20 number of initial shapes for each input image
    cout<<"Start training..."<<endl;
    bounding_box_ = bounding_box;
    training_shapes_ = ground_truth_shapes;
    first_level_num_ = first_level_num;
    landmark_num_ = ground_truth_shapes[0].rows; 
    // data augmentation and multiple initialization 
    vector<Mat_<uchar> > augmented_images;
    vector<BoundingBox> augmented_bounding_box;
    vector<Mat_<double> > augmented_ground_truth_shapes;
    vector<Mat_<double> > current_shapes;       // 扩大之后的初始化shape，绝对坐标

    // 扩大训练数据
    RNG random_generator(getTickCount());
    for(int i = 0;i < images.size();i++){
        for(int j = 0;j < initial_num;j++){ //为每一幅图片产生initial_num个初始化shape
            int index = 0;
            do{
                index = random_generator.uniform(0, images.size());
            }while(index == i);
            augmented_images.push_back(images[i]);
            augmented_ground_truth_shapes.push_back(ground_truth_shapes[i]);
            augmented_bounding_box.push_back(bounding_box[i]); 
            // 1. Select ground truth shapes of other images as initial shapes
            // 2. Project current shape to bounding box of ground truth shapes
            // 绝对坐标点，包含了人脸位置信息，这里先转换为相对坐标，再通过bounding_box还原，去除绝对坐标的偏差
            Mat_<double> temp = ground_truth_shapes[index];
            temp = ProjectShape(temp, bounding_box[index]);
            temp = ReProjectShape(temp, bounding_box[i]);
            current_shapes.push_back(temp); 
        } 
    }

    // 求平均shape模型，结果保存为相对坐标
    // get mean shape from training shapes
    mean_shape_ = GetMeanShape(ground_truth_shapes,bounding_box); 

    // train fern cascades
    fern_cascades_.resize(first_level_num);
    vector<Mat_<double> > prediction;
    for(int i = 0;i < first_level_num;i++){
        cout<<"Training fern cascades: "<<i+1<<" out of "<<first_level_num<<endl;
        prediction = fern_cascades_[i].Train(augmented_images,current_shapes,
                augmented_ground_truth_shapes,augmented_bounding_box,mean_shape_,second_level_num,candidate_pixel_num,fern_pixel_num, i+1, first_level_num);

        // update current shapes
        // 对每一副图片的形状进行更新,prediction[x] 中保存了n个特征点的位移更新量
        for(int j = 0;j < prediction.size();j++){
            current_shapes[j] = prediction[j] + ProjectShape(current_shapes[j], augmented_bounding_box[j]);
            current_shapes[j] = ReProjectShape(current_shapes[j],augmented_bounding_box[j]);
        }
    } 

}

FernCascade.cpp

这里开始进入正题，为每一幅图片生成400个特征点，然后放入到fern分类器中训练。生成特征点的部分稍微解释一下，从代码可以看出，就是先随机的生成400个随机点，这些点都在bounding_box里，然后计算每一个点距离shape特征点（29个point）的距离，找出距离最近的shape特征点的索引，那么这个特征点就是针对某一个shape特征点的特征。（特征有点乱，人脸特征点，我都在前面加了shape，特指人脸上的点）

// Trains one cascade stage (a sequence of ferns) and returns, for every
// training sample, the accumulated shape increment predicted by this stage.
//
// images               training images (one per augmented sample)
// current_shapes       current shape estimate of each sample
// ground_truth_shapes  target shape of each sample
// bounding_box         face bounding box of each sample
// mean_shape           mean shape used to index the candidate pixels
// second_level_num     number of ferns in this stage (500 in the demo)
// candidate_pixel_num  number of candidate pixels to sample (400 in the demo)
// fern_pixel_num       pixel pairs per fern (5 in the demo)
// curr_level_num       1-based index of this stage, used for progress output
// first_level_num      total number of stages, used for progress output
vector<Mat_<double> > FernCascade::Train(const vector<Mat_<uchar> >& images,
                                    const vector<Mat_<double> >& current_shapes,
                                    const vector<Mat_<double> >& ground_truth_shapes,
                                    const vector<BoundingBox> & bounding_box,
                                    const Mat_<double>& mean_shape,
                                    int second_level_num,               //500
                                    int candidate_pixel_num,            //400
                                    int fern_pixel_num,                 //5
                                    int curr_level_num,
                                    int first_level_num){               //10
    Mat_<double> candidate_pixel_locations(candidate_pixel_num,2);      // candidate offsets relative to the nearest mean-shape landmark
    Mat_<int> nearest_landmark_index(candidate_pixel_num,1);            // index of that nearest landmark
    vector<Mat_<double> > regression_targets;                           // residuals still to be explained
    RNG random_generator(getTickCount());
    second_level_num_ = second_level_num;

    // Regression targets: difference between ground truth and current shape,
    // both passed through ProjectShape with the sample's bounding box.
    regression_targets.resize(current_shapes.size());
    for(size_t i = 0;i < current_shapes.size();i++){
        regression_targets[i] = ProjectShape(ground_truth_shapes[i],bounding_box[i])
                                - ProjectShape(current_shapes[i],bounding_box[i]);
    }

    // Shape-indexed candidate pixels: draw points uniformly from [-1,1]^2,
    // reject those outside the unit circle, and store each accepted point as
    // an offset from its nearest landmark on the mean shape.
    int accepted = 0;
    while(accepted < candidate_pixel_num){
        const double x = random_generator.uniform(-1.0,1.0);
        const double y = random_generator.uniform(-1.0,1.0);
        if(x*x + y*y > 1.0){
            continue;
        }
        // Nearest landmark by squared Euclidean distance.
        double min_dist = 1e10;
        int min_index = 0;
        for(int j = 0;j < mean_shape.rows;j++){
            const double dx = mean_shape(j,0) - x;
            const double dy = mean_shape(j,1) - y;
            const double dist = dx*dx + dy*dy;
            if(dist < min_dist){
                min_dist = dist;
                min_index = j;
            }
        }
        candidate_pixel_locations(accepted,0) = x - mean_shape(min_index,0);
        candidate_pixel_locations(accepted,1) = y - mean_shape(min_index,1);
        nearest_landmark_index(accepted) = min_index;
        ++accepted;
    }

    // densities[j][i] is the intensity of candidate pixel j in image i.
    vector<vector<double> > densities;
    densities.resize(candidate_pixel_num);
    for(int j = 0;j < candidate_pixel_num;j++){
        densities[j].reserve(images.size());
    }
    for(size_t i = 0;i < images.size();i++){
        for(int j = 0;j < candidate_pixel_num;j++){
            // NOTE(review): both projections use the SUM of the x and y offsets.
            // This looks like a leftover from removing the similarity transform
            // (an identity transform would use the x offset for project_x and the
            // y offset for project_y). It is left unchanged because the prediction
            // code, not visible here, presumably uses the same mapping; change
            // both together.
            const double offset_sum = candidate_pixel_locations(j,0) + candidate_pixel_locations(j,1);
            const double project_x = offset_sum*bounding_box[i].width/2.0;
            const double project_y = offset_sum*bounding_box[i].height/2.0;
            const int index = nearest_landmark_index(j);
            // Truncate to integer pixel coordinates, then clamp to the image.
            int real_x = project_x + current_shapes[i](index,0);
            int real_y = project_y + current_shapes[i](index,1);
            real_x = std::max(0,std::min(real_x,images[i].cols-1));
            real_y = std::max(0,std::min(real_y,images[i].rows-1));
            densities[j].push_back(static_cast<double>(images[i](real_y,real_x)));
        }
    }

    // Covariance between the intensities of every pair of candidate pixels
    // (symmetric, so only the upper triangle is computed).
    Mat_<double> covariance(candidate_pixel_num,candidate_pixel_num);
    for(int i = 0;i < candidate_pixel_num;i++){
        for(int j = i;j< candidate_pixel_num;j++){
            const double correlation_result = calculate_covariance(densities[i],densities[j]);
            covariance(i,j) = correlation_result;
            covariance(j,i) = correlation_result;
        }
    }

    // Train the ferns sequentially. Each fern fits the current residuals; its
    // output is added to the prediction and subtracted from the residuals
    // (boosting-style, without sample re-weighting).
    vector<Mat_<double> > prediction;
    prediction.resize(regression_targets.size());
    for(size_t i = 0;i < regression_targets.size();i++){
        prediction[i] = Mat::zeros(mean_shape.rows,2,CV_64FC1);
    }
    ferns_.resize(second_level_num);
    const int report_interval = 50;     // ferns between two progress reports
    clock_t t = clock();
    for(int i = 0;i < second_level_num;i++){
        vector<Mat_<double> > temp = ferns_[i].Train(densities,covariance,candidate_pixel_locations,nearest_landmark_index,regression_targets,fern_pixel_num);
        for(size_t j = 0;j < temp.size();j++){
            prediction[j] = prediction[j] + temp[j];
            regression_targets[j] = regression_targets[j] - temp[j];
        }
        // Progress and remaining-time estimate every report_interval ferns.
        if((i+1) % report_interval == 0){
            cout<<"Fern cascades: "<< curr_level_num << " out of "<< first_level_num<<"; ";
            cout<<"Ferns: "<<i+1<<" out of "<<second_level_num<<endl;
            // Ferns still to train: all ferns of the remaining stages plus the
            // rest of this stage. Uses second_level_num rather than a fixed 500
            // so the estimate stays correct for other configurations.
            const double remaining_fern_num = double(first_level_num - curr_level_num) * second_level_num
                                              + (second_level_num - (i+1));
            const double seconds_per_fern = double(clock() - t) / CLOCKS_PER_SEC / report_interval;
            const double time_remaining = seconds_per_fern * remaining_fern_num;
            cout<<"Expected remaining time: "
                << static_cast<int>(time_remaining) / 60<<"min "<<static_cast<int>(time_remaining) % 60 <<"s"<<endl;
            t = clock();
        }
    }
    return prediction;
}

Fern.cpp

通过计算协方差求出5对特征点，根据特征点的差值产生一个2^5的分桶，统计每个分桶中落入的shape和ground_truth shape差值，求平均之后就是某个分桶中的shape的更新值。

// Trains a single fern and returns its prediction for every training sample.
//
// candidate_pixel_intensity  [pixel][sample] intensities of the candidate pixels
// covariance                 covariance between the intensities of every pixel pair
// candidate_pixel_locations  offset of each candidate pixel from its nearest landmark
// nearest_landmark_index     nearest landmark of each candidate pixel
// regression_targets         per-sample residual shape (landmarks x 2)
// fern_pixel_num             number of pixel pairs (binary tests) in the fern
//
// For each of the fern_pixel_num tests the residuals are projected onto a
// random direction and the pixel pair whose intensity difference correlates
// most strongly (in absolute value) with that projection is selected, with a
// random threshold. Samples are then binned by their test outcomes and each
// bin outputs the shrunken mean residual of its samples.
// Returns an empty vector when there are no regression targets.
vector<Mat_<double> > Fern::Train(const vector<vector<double> >& candidate_pixel_intensity,
                                  const Mat_<double>& covariance,
                                  const Mat_<double>& candidate_pixel_locations,
                                  const Mat_<int>& nearest_landmark_index,
                                  const vector<Mat_<double> >& regression_targets,
                                  int fern_pixel_num){

    // Nothing to learn from; also protects the regression_targets[0] access below.
    if(regression_targets.empty()){
        return vector<Mat_<double> >();
    }

    fern_pixel_num_ = fern_pixel_num;
    landmark_num_ = regression_targets[0].rows;
    selected_pixel_index_.create(fern_pixel_num,2);     // indices of the selected pixel pairs
    selected_pixel_locations_.create(fern_pixel_num,4); // pair locations stored as (x_1,y_1,x_2,y_2) per row
    selected_nearest_landmark_index_.create(fern_pixel_num,2);
    const int candidate_pixel_num = candidate_pixel_locations.rows;

    // threshold_: one threshold per selected pixel pair.
    threshold_.create(fern_pixel_num,1);

    RNG random_generator(getTickCount());
    for(int i = 0;i < fern_pixel_num;i++){
        // Random direction in shape space.
        Mat_<double> random_direction(landmark_num_ ,2);
        random_generator.fill(random_direction,RNG::UNIFORM,-1.1,1.1);
        normalize(random_direction,random_direction);

        // Scalar projection of every residual onto that direction.
        vector<double> projection_result(regression_targets.size(), 0);
        for(size_t j = 0;j < regression_targets.size();j++){
            projection_result[j] = sum(regression_targets[j].mul(random_direction))[0];
        }

        // Covariance between the projection and each candidate pixel's intensity.
        Mat_<double> covariance_projection_density(candidate_pixel_num,1);
        for(int j = 0;j < candidate_pixel_num;j++){
            covariance_projection_density(j) = calculate_covariance(projection_result,candidate_pixel_intensity[j]);
        }

        // Select the pixel pair (j,k) maximizing
        //   |cov(proj, I_j - I_k)| / sqrt(var(I_j - I_k)).
        double max_correlation = -1;
        int max_pixel_index_1 = 0;
        int max_pixel_index_2 = 0;
        for(int j = 0;j < candidate_pixel_num;j++){
            for(int k = 0;k < candidate_pixel_num;k++){
                // Variance of the intensity difference; skip (near-)constant pairs,
                // which also excludes j == k.
                const double diff_variance = covariance(j,j) + covariance(k,k) - 2*covariance(j,k);
                if(std::abs(diff_variance) < 1e-10){
                    continue;
                }
                // Skip pairs already used by an earlier test of this fern, in either order.
                bool already_selected = false;
                for(int p = 0;p < i;p++){
                    const bool same_order = (j == selected_pixel_index_(p,0) && k == selected_pixel_index_(p,1));
                    const bool swapped = (j == selected_pixel_index_(p,1) && k == selected_pixel_index_(p,0));
                    if(same_order || swapped){
                        already_selected = true;
                        break;
                    }
                }
                if(already_selected){
                    continue;
                }
                const double correlation = std::abs(
                    (covariance_projection_density(j) - covariance_projection_density(k))
                    / sqrt(diff_variance));
                // Track the absolute value. Storing the signed value would let a
                // negative score become the "maximum", after which almost any
                // later pair would replace it.
                if(correlation > max_correlation){
                    max_correlation = correlation;
                    max_pixel_index_1 = j;
                    max_pixel_index_2 = k;
                }
            }
        }

        selected_pixel_index_(i,0) = max_pixel_index_1;
        selected_pixel_index_(i,1) = max_pixel_index_2;
        selected_pixel_locations_(i,0) = candidate_pixel_locations(max_pixel_index_1,0);
        selected_pixel_locations_(i,1) = candidate_pixel_locations(max_pixel_index_1,1);
        selected_pixel_locations_(i,2) = candidate_pixel_locations(max_pixel_index_2,0);
        selected_pixel_locations_(i,3) = candidate_pixel_locations(max_pixel_index_2,1);
        selected_nearest_landmark_index_(i,0) = nearest_landmark_index(max_pixel_index_1);
        selected_nearest_landmark_index_(i,1) = nearest_landmark_index(max_pixel_index_2);

        // Random threshold within +-20% of the largest absolute intensity
        // difference of the selected pair over all samples.
        double max_diff = -1;
        const vector<double>& intensity_1 = candidate_pixel_intensity[max_pixel_index_1];
        const vector<double>& intensity_2 = candidate_pixel_intensity[max_pixel_index_2];
        for(size_t j = 0;j < intensity_1.size();j++){
            const double diff = std::abs(intensity_1[j] - intensity_2[j]);
            if(diff > max_diff){
                max_diff = diff;
            }
        }
        threshold_(i) = random_generator.uniform(-0.2*max_diff,0.2*max_diff);
    }

    // Assign every sample to a bin: test j contributes bit j of the bin index.
    vector<vector<int> > shapes_in_bin;
    const int bin_num = 1 << fern_pixel_num;
    shapes_in_bin.resize(bin_num);
    for(size_t i = 0;i < regression_targets.size();i++){
        int index = 0;
        for(int j = 0;j < fern_pixel_num;j++){
            const double density_1 = candidate_pixel_intensity[selected_pixel_index_(j,0)][i];
            const double density_2 = candidate_pixel_intensity[selected_pixel_index_(j,1)][i];
            if(density_1 - density_2 >= threshold_(j)){
                index |= (1 << j);
            }
        }
        shapes_in_bin[index].push_back(static_cast<int>(i));
    }

    // Bin outputs and per-sample predictions.
    vector<Mat_<double> > prediction;
    prediction.resize(regression_targets.size());
    bin_output_.resize(bin_num);
    for(int i = 0;i < bin_num;i++){
        Mat_<double> temp = Mat::zeros(landmark_num_,2, CV_64FC1);
        const int bin_size = static_cast<int>(shapes_in_bin[i].size());
        // Sum of the residuals that fell into this bin.
        for(int j = 0;j < bin_size;j++){
            const int index = shapes_in_bin[i][j];
            temp = temp + regression_targets[index];
        }
        // Empty bins output zero.
        if(bin_size == 0){
            bin_output_[i] = temp;
            continue;
        }
        // Shrunken mean: equivalent to sum / (bin_size + 1000), which damps
        // the output of sparsely populated bins to limit over-fitting.
        temp = (1.0/((1.0+1000.0/bin_size) * bin_size)) * temp;
        bin_output_[i] = temp;
        // Every sample in the bin receives the bin's output.
        for(int j = 0;j < bin_size;j++){
            const int index = shapes_in_bin[i][j];
            prediction[index] = temp;
        }
    }
    return prediction;
}