//Parameters of a single 2D Gaussian component of the mixture.
struct Cluster{
cv::Point2d mean_point; //mean (center) of the component
cv::Mat cov; //2x2 covariance matrix of the component
std::vector<int> member_pidx; //indices (into the input data) of the points assigned to this component
double ratio; //mixing weight of this component within the mixture (sums to 1 over all components)
};
//计算高斯的概率密度函数
//Probability density of the 1D Gaussian N(mean_val, sigma^2) evaluated at x.
//Degenerate case: |sigma| ~ 0 means every sample shares the same value along
//this axis, so the dimension carries no information — return the neutral
//factor 1.0 instead of dividing by zero.
//(Parameters are taken by value: the original non-const references accepted
//the same lvalue call sites, so this is backward compatible.)
double Gaussian1D(double x, double mean_val, double sigma){
    if (std::fabs(sigma) < 1e-4){
        return 1.0;
    }
    double z = (x - mean_val) / sigma;
    //pdf = exp(-z^2/2) / (sigma * sqrt(2*pi)).
    //BUG FIX: the original divided by sqrt(2*pi*sigma), which mis-normalizes
    //the density (sigma must sit outside the square root).
    return std::exp(-0.5 * z * z) / (sigma * std::sqrt(2.0 * M_PI));
}
//计算高斯的概率密度函数,二维的返回值为(0~正无穷),如果x或者y的标准差为0,表示x一样或者y一样,退化为一维分布
double Gaussian2D(cv::Point2d& point, cv::Point2d& mean_point, cv::Mat& cov){
const double ZERO_T = 1e-4;
double p = 0;
if (std::fabs(cov.at<double>(0, 0)) < ZERO_T){
if (std::fabs(cov.at<double>(1, 1)) < ZERO_T){
return 1.0;
}
p = Gaussian1D(point.y, mean_point.y, sqrt(cov.at<double>(1, 1)));
}
else if (std::fabs(cov.at<double>(1, 1)) < ZERO_T){
p = Gaussian1D(point.x, mean_point.x, sqrt(cov.at<double>(0, 0)));
}
else{
auto tp = point - mean_point;
cv::Mat mtp = (cv::Mat_<double>(2, 1) << tp.x, tp.y);
cv::Mat pp = -0.5 * (mtp.t() * cov.inv() * mtp);
p = 1.0 / (2 * M_PI * sqrt(cv::determinant(cov))) * exp(pp.at<double>(0));
}
return p;
}
//根据data统计该二维高斯分布
//Fit a single 2D Gaussian (mean + covariance, maximum likelihood estimates)
//to `data`, writing the result into clu.mean_point / clu.cov.
//clu.member_pidx and clu.ratio are left untouched.
void Gaussian2DFit(std::vector<cv::Point2d>& data, Cluster& clu){
    clu.mean_point = cv::Point2d(0, 0);
    clu.cov = (cv::Mat_<double>(2, 2) << 0, 0, 0, 0);
    if (data.empty()){
        return; //BUG FIX: the original divided by data.size() == 0
    }
    //sample mean
    for (std::size_t i = 0; i < data.size(); i++){
        clu.mean_point += data[i];
    }
    //divide by a double, not float: the (float) cast needlessly lost precision
    clu.mean_point /= (double)data.size();
    //sample covariance: average of (p - mean)(p - mean)^T
    for (std::size_t i = 0; i < data.size(); i++){
        auto tp = data[i] - clu.mean_point;
        cv::Mat mtp = (cv::Mat_<double>(2, 1) << tp.x, tp.y);
        clu.cov += mtp * mtp.t();
    }
    clu.cov /= (double)data.size();
}
//Mixture clustering by maximizing the likelihood (EM). The initial mean centers, covariances and weight ratios affect the clustering result; for data of a particular shape the initial conditions should be obtained from statistics on training data, or by using historical statistics to predict the current probabilities.
//Theory/formula reference: 《详解高斯混合聚类(GMM)算法原理》 (detailed explanation of the GMM clustering algorithm). [Fixed: this line was missing its // marker and broke compilation.]
//Gaussian-mixture (GMM / EM) clustering of 2D points into clu_num components.
//data:    input points (read-only here; kept non-const so the original
//         non-const Gaussian2D overload still binds)
//clu_num: number of mixture components requested
//clues:   output — one Cluster per component; member_pidx receives the
//         indices of the points assigned to it by maximum weighted density.
//Initial means are drawn at random from the data, so results vary run to run.
void MyGMM(std::vector<cv::Point2d>& data, int clu_num, std::vector<Cluster>& clues)
{
    std::vector<Cluster>(clu_num > 0 ? clu_num : 0).swap(clues);
    if (data.empty() || clu_num <= 0){
        return; //nothing to cluster — avoids % 0 and 0-size loops below
    }
    //--- initialization: random means, identity covariance, uniform weights
    srand(time(NULL));
    for (auto& item : clues){
        //BUG FIX: the original used (rand() * rand()) % size — the product of
        //two rand() values overflows int, which is undefined behavior.
        int r = rand() % (int)data.size();
        item.mean_point = data[r];
        item.cov = (cv::Mat_<double>(2, 2) << 1, 0, 0, 1);
        item.ratio = 1.0 / clu_num;
        item.member_pidx.reserve(data.size());
    }
    const double end_erro = 1e-5; //convergence threshold on the log-likelihood
    const int max_itr = 100;      //hard iteration cap (<= 0 disables the cap)
    std::vector<double> psum(data.size(), 0.0); //mixture density of each point
    int itr = 0;
    bool has_last = false;
    double last_ll = 0.0;
    while (true){
        std::vector<Cluster> last_clu = clues; //E-step reads last iteration's params
        //E step: per-point mixture density and total LOG-likelihood.
        //BUG FIX: the original multiplied raw densities (curr_em *= mpsum),
        //which underflows to 0 for even modest data sizes, so the
        //|last - curr| < eps test fired long before real convergence.
        double curr_ll = 0.0;
        for (std::size_t i = 0; i < data.size(); i++){
            double mpsum = 0.0;
            for (std::size_t j = 0; j < clues.size(); j++){
                mpsum += clues[j].ratio * Gaussian2D(data[i], last_clu[j].mean_point, last_clu[j].cov);
            }
            psum[i] = mpsum;
            curr_ll += std::log(mpsum + 1e-300); //+1e-300 keeps log() finite at 0
        }
        if ((has_last && std::fabs(last_ll - curr_ll) < end_erro)
            || (max_itr > 0 && itr >= max_itr)){
            break;
        }
        last_ll = curr_ll;
        has_last = true;
        //M step: re-estimate every component from its responsibilities.
        for (std::size_t i = 0; i < clues.size(); i++){
            double mrsum = 0.0;       //sum of this component's responsibilities
            cv::Point2d mrmsum(0, 0); //responsibility-weighted sum of points
            std::vector<double> point_p(data.size(), 0.0);
            for (std::size_t j = 0; j < data.size(); j++){
                double p = clues[i].ratio * Gaussian2D(data[j], last_clu[i].mean_point, last_clu[i].cov);
                //guard psum[j] == 0: 0/0 would poison every update with NaN
                double r = (psum[j] > 0.0) ? p / psum[j] : 0.0;
                point_p[j] = r;
                mrmsum += r * data[j];
                mrsum += r;
            }
            if (mrsum <= 0.0){
                continue; //empty component: keep its previous parameters
            }
            //mean update
            clues[i].mean_point = mrmsum / mrsum;
            cv::Mat mrcsum = cv::Mat::zeros(2, 2, CV_64FC1);
            for (std::size_t j = 0; j < data.size(); j++){
                auto tp = data[j] - clues[i].mean_point;
                cv::Mat mtp = (cv::Mat_<double>(2, 1) << tp.x, tp.y);
                mrcsum += point_p[j] * mtp * mtp.t();
            }
            //covariance update
            clues[i].cov = mrcsum / mrsum;
            //mixing-weight update
            clues[i].ratio = mrsum / data.size();
        }
        itr++;
    }
    //Hard assignment: each point joins the component with the highest
    //weighted density clues[j].ratio * N(point | mean_j, cov_j).
    for (std::size_t i = 0; i < data.size(); i++){
        double max_p = -1;
        int max_idx = -1;
        for (std::size_t j = 0; j < clues.size(); j++){
            double p = clues[j].ratio * Gaussian2D(data[i], clues[j].mean_point, clues[j].cov);
            if (max_p < p){
                max_p = p;
                max_idx = (int)j;
            }
        }
        if (max_idx < 0){
            continue;
        }
        clues[max_idx].member_pidx.push_back((int)i);
    }
}