AP算法不需要事先指定聚类数目,相反它将所有的数据点都作为潜在的聚类中心,称之为exemplar。以s矩阵的对角线上的数值s(k, k)作为k点能否成为聚类中心的评判标准,这意味着该值越大,这个点成为聚类中心的可能性也就越大,这个值又称作参考度p (preference)。聚类的数量受到参考度p的影响,如果认为每个数据点都有可能作为聚类中心,那么p就应取相同的值。如果取输入的相似度的均值作为p的值,得到聚类数量是中等的。如果取最小值,得到类数较少的聚类。
AP算法中传递两种类型的消息(responsibility)和(availability)。r(i,k)表示从点i发送到候选聚类中心k的数值消息,反映k点是否适合作为i点的聚类中心。a(i,k)则从候选聚类中心k发送到i的数值消息,反映i点是否选择k作为其聚类中心。r (i,k)与a (i, k)越强,则k点作为聚类中心的可能性就越大,并且i点隶属于以k点为聚类中心的聚类的可能性也越大。AP算法通过迭代过程不断更新每一个点的吸引度和归属度值,直到产生m个高质量的exemplar,同时将其余的数据点分配到相应的聚类中。 在这里介绍几个文中常出现的名词:
exemplar:指的是聚类中心。
similarity:数据点i和点j的相似度记为s(i, j)。是指点j作为点i的聚类中心的相似度。一般使用欧氏距离来计算,如 。文中,所有点与点的相似度值全部取为负值。因为我们可以看到,相似度值越大说明点与点的距离越近,便于后面的比较计算。
preference:数据点i的参考度称为p(i)或s(i,i)。是指点i作为聚类中心的参考度。一般取s相似度值的中值。
responsibility: r(i,k)用来描述点k适合作为数据点i的聚类中心的程度。
availability:a(i,k)用来描述点i选择点k作为其聚类中心的适合程度。
两者的关系如下图:
Damping factor(阻尼系数):主要是起收敛作用的。AP聚类算法迭代过程很容易产生震荡,所以一般每次迭代都加上一个阻尼系数([0.5,1)):
先计算n个点之间的相似度值,将值放在s矩阵中,再选取p值(一般取s的中值)。设置一个最大迭代次数maxits (文中设默认值为1000),迭代过程开始。
迭代的过程主要更新两个矩阵,代表(Responsibility)矩阵r=[r(i,k)],(n*n)和适选(Availabilities)矩阵a=[a(i,k)],(n*n)。这两个矩阵初始化为0,n是所有样本的数目。r(i,k)表示第k个样本适合作为第i个样本的类代表点的代表程度,a(i,k)表示第i个样本选择第k个样本作为类代表样本的适合程度。迭代更新公式如(2)(3)。
每次更新后就可以确定当前样本i的代表样本(exemplar)点k,k就是使{a(i,k)+r(i,k)}取得最大值的那个k,如果i=k的话,那么说明样本i就是自己这个cluster的类代表点,如果不是,那么说明i属于k所属的那个cluster。当然,迭代停止的条件就是所有的样本的所属经过连续的convits次迭代都不再变化,或者迭代超过了maxits次。
// ConsoleApplication3.cpp : 定义控制台应用程序的入口点。
//
#include "stdafx.h"
#include<iostream>
#include<stdio.h>
#include<algorithm>
using namespace std;
class Point{
public:
int x;
int y;
int label;
};
double Count_distance(Point point_i, Point point_j){//count the distance between two points
double dis = (point_i.x - point_j.x)*(point_i.x - point_j.x) + (point_i.y - point_j.y)*(point_i.y - point_j.y);
dis = 1/sqrt(dis);
return dis;
}
/*double Get_P_value(double (*similarity_matrix)[20],int N ){
// this is an average p value , sometimes we want to get a mid value
double average = 0.0;
int count = 0;
for (int i = 0; i < N-1; i++){
for (int j = i + 1; j < N; j++){
average += similarity_matrix[i][j];
count += 1;
}
}
return (average / count);
}*/
double Get_P_value(double(*similarity_matrix)[20], int N){
// this is an mid p value
double mid[190];
int index = 0;
for (int i = 0; i < N-1; i++){
for (int j = i + 1; j < N; j++){
mid[index++] = similarity_matrix[i][j];
}
}
sort(mid,mid+190);
return mid[95];
}
double Max_a_plus_s(double(*a)[20], double(*similarity_matrix)[20], int i, int k, int N){ // max{ a(i,k') + s(i,k') }
double max=-100000,tmp;
for (int k_ = 0; k_ < N; k_++){
if (k_ != k){
tmp = a[i][k_] + similarity_matrix[i][k_];
if (tmp>max){
max = tmp;
}
}
}
return max;
}
double max(double a, double b){ // return the max of a and b
return (a > b ? a : b);
}
double min(double a, double b){
return (a < b ? a : b);
}
double Sum_max_1(double(*r)[20], int i, int k, int N){ // Sum( max{0,r(i',k)} ) i'!={i,k}
double sum = 0.0;
for (int i_ = 0; i_<N; i_++){
if (i_ != i && i_ != k){
sum = sum + max(0.0, r[i_][k]);
}
}
return sum;
}
double Sum_max_2(double(*r)[20], int k, int N){
double sum = 0;
for (int i_=0; i_<N; i_++){
if (i_ != k){
sum = sum + max(0.0, r[i_][k]);
}
}
return sum;
}
void Reconstruction_r(double(*similarity_matrix)[20], double(*r)[20], double(*a)[20],int N ,double lamda){ // reconstruct the r matrix
//cout << "reconstruction_r " << endl;
double r_tmp;
for (int i = 0; i < N; i++){
for (int k = 0; k < N; k++){
if (i != k){
r_tmp = similarity_matrix[i][k] - Max_a_plus_s(a, similarity_matrix, i, k, N);//get r_t
r[i][k] = lamda*r[i][k] + (1 - lamda)*r_tmp; // r_new = lamda * r_old + (1-lamda)*r
}
else {
r_tmp = max(-similarity_matrix[i][k], similarity_matrix[i][k] - Max_a_plus_s(a, similarity_matrix, i, k, N));
r[i][k] = lamda*r[i][k] + (1 - lamda)*r_tmp;
}
}
}
}
void Reconstruction_a(double(*similarity_matrix)[20], double(*r)[20], double(*a)[20], int N, double lamda){// reconstruct the a matrix
//cout << "reconstruction_a " << endl;
double a_tmp;
for (int i = 0; i < N; i++){
for (int k = 0; k < N; k++){
if (i != k){
a_tmp = min(0.0, r[k][k] + Sum_max_1(r, i, k,N));// min{ 0,r(k,k)+Sum(max{0,r(i',k)}) } i'!={i,k}
a[i][k] = lamda*a[i][k] + (1 - lamda)*a_tmp;
}
else{
a_tmp = Sum_max_2(r, k, N);
}
}
}
}
int Search_Center(double(*E)[20], int i,int N){ // return max a+r
double max = E[i][0];
int max_label=0;
for (int k = 0; k < N; k++){
if (E[i][k]>max){
max = E[i][k];
max_label = k;
}
}
return max_label;
}
int _tmain(int argc, _TCHAR* argv[])
{
const int N = 20;
Point points[N];
cout << "we are going to init the points" << endl << endl;
for (int i = 0; i < N; i++){
points[i].x = rand()%100;
points[i].y = rand() % 100;
points[i].label = i;
}
cout << "the points' coordinates are:" << endl << "number x y label:" << endl;
for (int i = 0; i < N;i++){
cout << i<<" "<< points[i].x << " " << points[i].y << " " << points[i].label << endl;
}
cout << "we are going to construct the similarity matrix:" << endl<<endl;
double similarity_matrix[N][N];
for (int i = 0; i < N; i++){ // construct the similarity matrix
for (int j = 0; j < N; j++){
if (i != j){
similarity_matrix[i][j] = Count_distance(points[i], points[j]);
}
else if (i == j){
similarity_matrix[i][j] = 0;
}
}
}
double p_value = Get_P_value(similarity_matrix,N);// get the p value
for (int i = 0; i < N; i++){
similarity_matrix[i][i] = p_value;
}
cout << "the similarity matrix and the p value is :" << endl << endl;
for (int i = 0; i < N ; i++){
for (int j = 0; j < N; j++){
cout << similarity_matrix[i][j] << " ";
}
cout << endl;
}
cout << "the p value is " << p_value << endl << endl;
double r[N][N];
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
r[i][j] = 0;
}
}
double a[N][N];
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
a[i][j] = 0;
}
}
int iteration = 1000;
double lamda = 0.618;
while (iteration>0){
Reconstruction_r(similarity_matrix, r, a,N,lamda);
Reconstruction_a(similarity_matrix, r, a,N,lamda);
cout << 100-iteration << endl;
iteration -= 1;
}
double E[N][N];
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
E[i][j] = a[i][j] + r[i][j];
}
}
for (int i = 0; i < N; i++){
points[i].label = Search_Center(E, i,N);
}
/* cout << "the r matrix is :" << endl << endl;
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
cout << r[i][j] << " ";
}
cout << endl;
}
cout << "the a matrix is :" << endl << endl;
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
cout << r[i][j] << " ";
}
cout << endl;
} */
cout << "the E matrix is :" << endl << endl;
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
if (i == j){
cout << E[i][j] << " ";
}
}
}
cout << endl;
cout << "the cluster results is:" << endl;
for (int i = 0; i < N; i++){
cout << points[i].label << " ";
}
cout << endl;
system("pause");
return 0;
}