广义上来说,任何在算法中用到SVD/特征值分解的,都叫Spectral Algorithm。顺便说一下,对于任意矩阵只存在奇异值分解,不存在特征值分解。对于正定的对称矩阵,奇异值就是特征值,奇异向量就是特征向量。
传统的聚类算法,如K-Means、EM算法都是建立在凸球形样本空间上,当样本空间不为凸时,算法会陷入局部最优,最终结果受初始参数的选择影响比较大。而谱聚类可以在任意形状的样本空间上聚类,且收敛于全局最优解。
谱聚类和CHAMELEON聚类很像,都是把样本点的相似度放到一个带权无向图中,采用“图划分”的方法进行聚类。只是谱聚类算法在进行图划分的时候发现计算量很大,转而求特征值去了,而且最后还在几个小特征向量组成的矩阵上进行了K-Means聚类。
Simply speaking,谱聚类算法分为3步:
- 构造一个N×N的权值矩阵W,Wij表示样本i和样本j的相似度,显然W是个对称矩阵。相似度的计算方法很多了,你可以用欧拉距离、街区距离、向量夹角、皮尔森相关系数等。并不是任意两个点间的相似度都要表示在图上,我们希望的权值图是比较稀疏的,有2种方法:权值小于阈值的认为是0;K最邻近方法,即每个点只和跟它最近的k个点连起来,CHAMELEON算法的第1阶段就是这么干的。再构造一个对角矩阵D,Dii为W第i列元素之和。最后构造矩阵L=D-W。可以证明L是个半正定和对称矩阵。
- 求L的前K小特征值对应的特征向量(这要用到奇异值分解了)。把K个特征向量放在一起构造一个N×K的矩阵M。
- 把M的每一行当成一个新的样本点,对这N个新的样本点进行K-Means聚类。
从文件读入样本点,最终算得矩阵L
#include<math.h>
#include<string.h>
#include"matrix.h"
#include"svd.h"
#define N 19 //样本点个数
#define K 4 //K-Means算法中的K
#define T 0.1 //样本点之间相似度的阈值
double
sample[N][2];
//存放所有样本点的坐标(2维的)
void
readSample(
char
*filename){
FILE
*fp;
if
((fp=
fopen
(filename,
"r"
))==NULL){
perror
(
"fopen"
);
exit
(0);
}
char
buf[50]={0};
int
i=0;
while
(
fgets
(buf,
sizeof
(buf),fp)!=NULL){
char
*w=
strtok
(buf,
" \t"
);
double
x=
atof
(w);
w=
strtok
(NULL,
" \t"
);
double
y=
atof
(w);
sample[i][0]=x;
sample[i][1]=y;
i++;
memset
(buf,0x00,
sizeof
(buf));
}
assert
(i==N);
fclose
(fp);
}
double
** getSimMatrix(){
//为二维矩阵申请空间
double
**matrix=getMatrix(N,N);
//计算样本点两两之间的相似度,得到矩阵W
int
i,j;
for
(i=0;i<N;i++){
matrix[i][i]=1;
for
(j=i+1;j<N;j++){
double
dist=
sqrt
(
pow
(sample[i][0]-sample[j][0],2)+
pow
(sample[i][1]-sample[j][1],2));
double
sim=1.0/(1+dist);
if
(sim>T){
matrix[j][i]=sim;
matrix[i][j]=sim;
}
}
}
//计算L=D-W
for
(j=0;j<N;j++){
double
sum=0;
for
(i=0;i<N;i++){
sum+=matrix[i][j];
if
(i!=j)
matrix[i][j]=0-matrix[i][j];
}
matrix[j][j]=matrix[j][j]-sum;
}
return
matrix;
}
int
main(){
char
*file=
"/home/orisun/data"
;
readSample(file);
double
**L=getSimMatrix();
printMatrix(L,N,N);
double
**M=singleVector(L,N,N,5);
printMatrix(M,N,5);
freeMatrix(L,N);
return
0;
}
|
L已是对称矩阵,直接奇异值分解的得到的就是特征向量
#ifndef _MATRIX_H
#define _MATRIX_H
#include<assert.h>
#include<stdlib.h>
#include<stdio.h>
//初始化一个二维矩阵
double
** getMatrix(
int
rows,
int
columns){
double
**rect=(
double
**)
calloc
(rows,
sizeof
(
double
*));
int
i;
for
(i=0;i<rows;++i)
rect[i]=(
double
*)
calloc
(columns,
sizeof
(
double
));
return
rect;
}
//返回一个单位矩阵
double
** getIndentityMatrix(
int
rows){
double
** IM=getMatrix(rows,rows);
int
i;
for
(i=0;i<rows;++i)
IM[i][i]=1.0;
return
IM;
}
//返回一个矩阵的副本
double
** copyMatrix(
double
** matrix,
int
rows,
int
columns){
double
** rect=getMatrix(rows,columns);
int
i,j;
for
(i=0;i<rows;++i)
for
(j=0;j<columns;++j)
rect[i][j]=matrix[i][j];
return
rect;
}
//从一个一维矩阵得到一个二维矩阵
void
getFromArray(
double
** matrix,
int
rows,
int
columns,
double
*arr){
int
i,j,k=0;
for
(i=0;i<rows;++i){
for
(j=0;j<columns;++j){
matrix[i][j]=arr[k++];
}
}
}
//打印二维矩阵
void
printMatrix(
double
** matrix,
int
rows,
int
columns){
int
i,j;
for
(i=0;i<rows;++i){
for
(j=0;j<columns;++j){
printf
(
"%-10f\t"
,matrix[i][j]);
}
printf
(
"\n"
);
}
}
//释放二维矩阵
void
freeMatrix(
double
** matrix,
int
rows){
int
i;
for
(i=0;i<rows;++i)
free
(matrix[i]);
free
(matrix);
}
//获取二维矩阵的某一行
double
* getRow(
double
**matrix,
int
rows,
int
columns,
int
index){
assert
(index<rows);
double
*rect=(
double
*)
calloc
(columns,
sizeof
(
double
));
int
i;
for
(i=0;i<columns;++i)
rect[i]=matrix[index][i];
return
rect;
}
//获取二维矩阵的某一列
double
* getColumn(
double
**matrix,
int
rows,
int
columns,
int
index){
assert
(index<columns);
double
*rect=(
double
*)
calloc
(rows,
sizeof
(
double
));
int
i;
for
(i=0;i<rows;++i)
rect[i]=matrix[i][index];
return
rect;
}
//设置二维矩阵的某一列
void
setColumn(
double
**matrix,
int
rows,
int
columns,
int
index,
double
*arr){
assert
(index<columns);
int
i;
for
(i=0;i<rows;++i)
matrix[i][index]=arr[i];
}
//交换矩阵的某两列
void
exchangeColumn(
double
**matrix,
int
rows,
int
columns,
int
i,
int
j){
assert
(i<columns);
assert
(j<columns);
int
row;
for
(row=0;row<rows;++row){
double
tmp=matrix[row][i];
matrix[row][i]=matrix[row][j];
matrix[row][j]=tmp;
}
}
//得到矩阵的转置
double
** getTranspose(
double
**matrix,
int
rows,
int
columns){
double
**rect=getMatrix(columns,rows);
int
i,j;
for
(i=0;i<columns;++i){
for
(j=0;j<rows;++j){
rect[i][j]=matrix[j][i];
}
}
return
rect;
}
//计算两向量内积
double
vectorProduct(
double
*vector1,
double
*vector2,
int
len){
double
rect=0.0;
int
i;
for
(i=0;i<len;++i)
rect+=vector1[i]*vector2[i];
return
rect;
}
//两个矩阵相乘
double
** matrixProduct(
double
**matrix1,
int
rows1,
int
columns1,
double
**matrix2,
int
columns2){
double
**rect=getMatrix(rows1,columns2);
int
i,j;
for
(i=0;i<rows1;++i){
for
(j=0;j<columns2;++j){
double
*vec1=getRow(matrix1,rows1,columns1,i);
double
*vec2=getColumn(matrix2,columns1,columns2,j);
rect[i][j]=vectorProduct(vec1,vec2,columns1);
free
(vec1);
free
(vec2);
}
}
return
rect;
}
//得到某一列元素的平方和
double
getColumnNorm(
double
** matrix,
int
rows,
int
columns,
int
index){
assert
(index<columns);
double
* vector=getColumn(matrix,rows,columns,index);
double
norm=vectorProduct(vector,vector,rows);
free
(vector);
return
norm;
}
//打印向量
void
printVector(
double
* vector,
int
len){
int
i;
for
(i=0;i<len;++i)
printf
(
"%-15.8f\t"
,vector[i]);
printf
(
"\n"
);
}
#endif
|
#include"matrix.h"
#define ITERATION 100 //单边Jacobi最大迭代次数
#define THREASHOLD 0.1
//符号函数
int
sign(
double
number) {
if
(number<0)
return
-1;
else
return
1;
}
//两个向量进行单边Jacobi正交变换
void
orthogonalVector(
double
*Ci,
double
*Cj,
int
len1,
double
*Vi,
double
*Vj,
int
len2,
int
*pass){
double
ele=vectorProduct(Ci,Cj,len1);
if
(
fabs
(ele)<THREASHOLD)
return
;
//如果两列已经正交,不需要进行变换,则返回true
*pass=0;
double
ele1=vectorProduct(Ci,Ci,len1);
double
ele2=vectorProduct(Cj,Cj,len1);
double
tao=(ele1-ele2)/(2*ele);
double
tan
=sign(tao)/(
fabs
(tao)+
sqrt
(1+
pow
(tao,2)));
double
cos
=1/
sqrt
(1+
pow
(
tan
,2));
double
sin
=
cos
*
tan
;
int
row;
for
(row=0;row<len1;++row){
double
var1=Ci[row]*
cos
+Cj[row]*
sin
;
double
var2=Cj[row]*
cos
-Ci[row]*
sin
;
Ci[row]=var1;
Cj[row]=var2;
}
for
(row=0;row<len2;++row){
double
var1=Vi[row]*
cos
+Vj[row]*
sin
;
double
var2=Vj[row]*
cos
-Vi[row]*
sin
;
Vi[row]=var1;
Vj[row]=var2;
}
}
//矩阵的两列进行单边Jacobi正交变换。V是方阵,行/列数为columns
void
orthogonal(
double
**matrix,
int
rows,
int
columns,
int
i,
int
j,
int
*pass,
double
**V){
assert
(i<j);
double
* Ci=getColumn(matrix,rows,columns,i);
double
* Cj=getColumn(matrix,rows,columns,j);
double
* Vi=getColumn(V,columns,columns,i);
double
* Vj=getColumn(V,columns,columns,j);
orthogonalVector(Ci,Cj,rows,Vi,Vj,columns,pass);
int
row;
for
(row=0;row<rows;++row){
matrix[row][i]=Ci[row];
matrix[row][j]=Cj[row];
}
for
(row=0;row<columns;++row){
V[row][i]=Vi[row];
V[row][j]=Vj[row];
}
free
(Ci);
free
(Cj);
free
(Vi);
free
(Vj);
}
//循环正交,进行奇异值分解
void
hestens_jacobi(
double
**matrix,
int
rows,
int
columns,
double
**V)
{
int
iteration = ITERATION;
while
(iteration-- > 0) {
int
pass = 1;
int
i,j;
for
(i = 0; i < columns; ++i) {
for
(j = i+1; j < columns; ++j) {
orthogonal(matrix,rows,columns,i,j,&pass,V);
//经过多次的迭代正交后,V就求出来了
}
}
if
(pass==1)
//当任意两列都正交时退出迭代
break
;
}
printf
(
"迭代次数:%d\n"
,ITERATION - iteration);
}
//获取矩阵前n小的奇异向量
double
**singleVector(
double
**A,
int
rows,
int
columns,
int
n){
double
**V=getIndentityMatrix(columns);
hestens_jacobi(A,rows,columns,V);
double
*singular=(
double
*)
calloc
(columns,
sizeof
(
double
));
//特征值
int
i,j;
for
(i=0;i<columns;++i){
double
*vector=getColumn(A,rows,columns,i);
double
norm=
sqrt
(vectorProduct(vector,vector,rows));
singular[i]=norm;
}
int
*sort=(
int
*)
calloc
(columns,
sizeof
(
int
));
for
(i=0;i<columns;++i)
sort[i]=i;
for
(i=0;i<columns-1;++i){
int
minIndex=i;
int
minValue=singular[i];
for
(j=i+1;j<columns;++j){
if
(singular[j]<minValue){
minValue=singular[j];
minIndex=j;
}
}
//交换sigular的第i个和第minIndex个元素
singular[minIndex]=singular[i];
singular[i]=minValue;
//交换sort的第i个和第minIndex个元素
int
tmp=sort[minIndex];
sort[minIndex]=sort[i];
sort[i]=tmp;
}
double
**rect=getMatrix(rows,n);
for
(i=0;i<rows;++i){
for
(j=0;j<n;++j){
rect[i][j]=V[i][sort[j]];
}
}
freeMatrix(V,columns);
free
(sort);
free
(singular);
return
rect;
}
|
最后是运行KMeans的Java代码
package
ai;
public
class
Global {
//计算两个向量的欧氏距离
public
static
double
calEuraDist(
double
[] arr1,
double
[] arr2,
int
len){
double
result=
0.0
;
for
(
int
i=
0
;i<len;i++){
result+=Math.pow(arr1[i]-arr2[i],
2.0
);
}
return
Math.sqrt(result);
}
}
|
package
ai;
public
class
DataObject {
String docname;
double
[] vector;
int
cid;
boolean
visited;
public
DataObject(
int
len){
vector=
new
double
[len];
}
public
String getName() {
return
docname;
}
public
void
setName(String docname) {
this
.docname = docname;
}
public
double
[] getVector() {
return
vector;
}
public
void
setVector(
double
[] vector) {
this
.vector = vector;
}
public
int
getCid() {
return
cid;
}
public
void
setCid(
int
cid) {
this
.cid = cid;
}
public
boolean
isVisited() {
return
visited;
}
public
void
setVisited(
boolean
visited) {
this
.visited = visited;
}
}
|
package
ai;
import
java.io.BufferedReader;
import
java.io.File;
import
java.io.FileReader;
import
java.io.IOException;
import
java.util.ArrayList;
import
java.util.Iterator;
public
class
DataSource {
ArrayList<DataObject> objects;
int
row;
int
col;
public
void
readMatrix(File dataFile) {
try
{
FileReader fr =
new
FileReader(dataFile);
BufferedReader br =
new
BufferedReader(fr);
String line = br.readLine();
String[] words = line.split(
"\\s+"
);
row = Integer.parseInt(words[
0
]);
// row=1000;
col = Integer.parseInt(words[
1
]);
objects =
new
ArrayList<DataObject>(row);
for
(
int
i =
0
; i < row; i++) {
DataObject object =
new
DataObject(col);
line = br.readLine();
words = line.split(
"\\s+"
);
for
(
int
j =
0
; j < col; j++) {
object.getVector()[j] = Double.parseDouble(words[j]);
}
objects.add(object);
}
br.close();
}
catch
(IOException e) {
e.printStackTrace();
}
}
public
void
readRLabel(File file) {
try
{
FileReader fr =
new
FileReader(file);
BufferedReader br =
new
BufferedReader(fr);
String line =
null
;
for
(
int
i =
0
; i < row; i++) {
line = br.readLine();
objects.get(i).setName(line.trim());
}
}
catch
(IOException e) {
e.printStackTrace();
}
}
public
void
printResult(ArrayList<DataObject> objects,
int
n) {
//DBScan是从第1类开始,K-Means是从第0类开始
// for (int i =0; i <n; i++) {
for
(
int
i=
1
;i<=n;i++){
System.out.println(
"=============属于第"
+i+
"类的有:==========================="
);
Iterator<DataObject> iter = objects.iterator();
while
(iter.hasNext()) {
DataObject object = iter.next();
int
cid=object.getCid();
if
(cid==i){
System.out.println(object.getName());
// switch(Integer.parseInt(object.getName())/1000){
// case 0:
// System.out.println(0);
// break;
// case 1:
// System.out.println(1);
// break;
// case 2:
// System.out.println(2);
// break;
// case 3:
// System.out.println(3);
// break;
// case 4:
// System.out.println(4);
// break;
// case 5:
// System.out.println(5);
// break;
// default:
// System.out.println("Go Out");
// break;
// }
}
}
}
}
}
|
package
ai;
import
java.io.File;
import
java.util.ArrayList;
import
java.util.Iterator;
import
java.util.Random;
public
class
KMeans {
int
k;
// 指定划分的簇数
double
mu;
// 迭代终止条件,当各个新质心相对于老质心偏移量小于mu时终止迭代
double
[][] center;
// 上一次各簇质心的位置
int
repeat;
// 重复运行次数
double
[] crita;
// 存放每次运行的满意度
public
KMeans(
int
k,
double
mu,
int
repeat,
int
len) {
this
.k = k;
this
.mu = mu;
this
.repeat = repeat;
center =
new
double
[k][];
for
(
int
i =
0
; i < k; i++)
center[i] =
new
double
[len];
crita =
new
double
[repeat];
}
// 初始化k个质心,每个质心是len维的向量,每维均在left--right之间
public
void
initCenter(
int
len, ArrayList<DataObject> objects) {
Random random =
new
Random(System.currentTimeMillis());
int
[] count =
new
int
[k];
// 记录每个簇有多少个元素
Iterator<DataObject> iter = objects.iterator();
while
(iter.hasNext()) {
DataObject object = iter.next();
int
id = random.nextInt(
10000
)%k;
count[id]++;
for
(
int
i =
0
; i < len; i++)
center[id][i] += object.getVector()[i];
}
for
(
int
i =
0
; i < k; i++) {
for
(
int
j =
0
; j < len; j++) {
center[i][j] /= count[i];
}
}
}
// 把数据集中的每个点归到离它最近的那个质心
public
void
classify(ArrayList<DataObject> objects) {
Iterator<DataObject> iter = objects.iterator();
while
(iter.hasNext()) {
DataObject object = iter.next();
double
[] vector = object.getVector();
int
len = vector.length;
int
index =
0
;
double
neardist = Double.MAX_VALUE;
for
(
int
i =
0
; i < k; i++) {
double
dist = Global.calEuraDist(vector, center[i], len);
// 使用欧氏距离
if
(dist < neardist) {
neardist = dist;
index = i;
}
}
object.setCid(index);
}
}
// 重新计算每个簇的质心,并判断终止条件是否满足,如果不满足更新各簇的质心,如果满足就返回true.len是数据的维数
public
boolean
calNewCenter(ArrayList<DataObject> objects,
int
len) {
boolean
end =
true
;
int
[] count =
new
int
[k];
// 记录每个簇有多少个元素
double
[][] sum =
new
double
[k][];
for
(
int
i =
0
; i < k; i++)
sum[i] =
new
double
[len];
Iterator<DataObject> iter = objects.iterator();
while
(iter.hasNext()) {
DataObject object = iter.next();
int
id = object.getCid();
count[id]++;
for
(
int
i =
0
; i < len; i++)
sum[id][i] += object.getVector()[i];
}
for
(
int
i =
0
; i < k; i++) {
if
(count[i] !=
0
) {
for
(
int
j =
0
; j < len; j++) {
sum[i][j] /= count[i];
}
}
// 簇中不包含任何点,及时调整质心
else
{
int
a=(i+
1
)%k;
int
b=(i+
3
)%k;
int
c=(i+
5
)%k;
for
(
int
j =
0
; j < len; j++) {
center[i][j] = (center[a][j]+center[b][j]+center[c][j])/
3
;
}
}
}
for
(
int
i =
0
; i < k; i++) {
// 只要有一个质心需要移动的距离超过了mu,就返回false
if
(Global.calEuraDist(sum[i], center[i], len) >= mu) {
end =
false
;
break
;
}
}
if
(!end) {
for
(
int
i =
0
; i < k; i++) {
for
(
int
j =
0
; j < len; j++)
center[i][j] = sum[i][j];
}
}
return
end;
}
// 计算各簇内数据和方差的加权平均,得出本次聚类的满意度.len是数据的维数
public
double
getSati(ArrayList<DataObject> objects,
int
len) {
double
satisfy =
0.0
;
int
[] count =
new
int
[k];
double
[] ss =
new
double
[k];
Iterator<DataObject> iter = objects.iterator();
while
(iter.hasNext()) {
DataObject object = iter.next();
int
id = object.getCid();
count[id]++;
for
(
int
i =
0
; i < len; i++)
ss[id] += Math.pow(object.getVector()[i] - center[id][i],
2.0
);
}
for
(
int
i =
0
; i < k; i++) {
satisfy += count[i] * ss[i];
}
return
satisfy;
}
public
double
run(
int
round, DataSource datasource,
int
len) {
System.out.println(
"第"
+ round +
"次运行"
);
initCenter(len,datasource.objects);
classify(datasource.objects);
while
(!calNewCenter(datasource.objects, len)) {
classify(datasource.objects);
}
datasource.printResult(datasource.objects, k);
double
ss = getSati(datasource.objects, len);
System.out.println(
"加权方差:"
+ ss);
return
ss;
}
public
static
void
main(String[] args) {
DataSource datasource =
new
DataSource();
datasource.readMatrix(
new
File(
"/home/orisun/test/dot.mat"
));
datasource.readRLabel(
new
File(
"/home/orisun/test/dot.rlabel"
));
int
len = datasource.col;
// 划分为4个簇,质心移动小于1E-8时终止迭代,重复运行7次
KMeans km =
new
KMeans(
4
, 1E-
10
,
7
, len);
int
index =
0
;
double
minsa = Double.MAX_VALUE;
for
(
int
i =
0
; i < km.repeat; i++) {
double
ss = km.run(i, datasource, len);
if
(ss < minsa) {
minsa = ss;
index = i;
}
}
System.out.println(
"最好的结果是第"
+ index +
"次。"
);
}
}
|
原文来自:博客园(华夏35度)http://www.cnblogs.com/zhangchaoyang 作者:Orisun