library(scatterplot3d)
##一键清空
rm(list=ls())
##打印颜色的函数
color<-function(Y1,num){
colo<-rep("black",num);
cl<-c("green","red","blue","yellow")
for (i in 1:num){
colo[i]<-cl[Y1[i]]
}
return (colo)
}
##K-means聚类
kmeans<-function(num,k,v,Y,X,Y2,e){
##从样本集中选择k个样本为初始均值向量
samp<-sample(1:num,k)
for (i in 1:k){
Y[,i]<-X[,samp[i]]
}
repeat{
for (i in 1:num){
##计算样本与各均值向量的距离
distance<-rep(0,k)
for(j in 1:k){
distance[j]<-norm(matrix(X[,i]-Y[,j]))
}
##根据最近的均值向量确定各样本的标记
Y2[i]<-which.min(distance)
}
##更新均值向量
oldY<-Y
Y<-matrix(rep(0,v*k),nrow=v,ncol=k,byrow=TRUE)
for(i in 1:num){
Y[,Y2[i]]<-Y[,Y2[i]]+X[,i]
}
numt<-rep(0,k)
for(i in 1:k){
numt[i]<-sum(Y2==i)
}
for(i in 1:k){
Y[,i]<-Y[,i]/numt[i]
}
##若变化小于精度,返回
cat(norm(oldY-Y))
if(norm(oldY-Y)<e)
break;
}
return (list(one=Y2,two=Y))
}
P<-function(x,Y,xfc,v){
return ((exp(-1/2*t(x-Y)%*%(solve(xfc))%*%(x-Y)))/((2*pi)^(v/2)*det(xfc)^(1/2)))
}
p<-function(j,i,k,hh,xfc,Y,X,v){
fm<-0
for(kk in 1:k){
fm<-fm+hh[kk]*P(X[,j],Y[,kk],xfc[,,kk],v)
}
fz<-hh[i]*P(X[,j],Y[,i],xfc[,,i],v)
return (fz/fm)
}
gmm<-function(num,v,k,X,Y,Y2,e){
##初始化混合系数
hh<-rep(1/k,k)
##初始化均值向量
samp<-sample(1:num,k)
for (i in 1:k){
Y[,i]<-X[,samp[i]]
}
##初始化协方差矩阵
xfc<-array(0.1*diag(v), dim=c(v,v,k))
hygl<-matrix(rep(0,num*k),nrow=num,ncol=k,byrow=TRUE)
repeat{
oldY<-Y
##计算样本由各混合成分生成的后验概率
for (j in 1:num){
for(i in 1:k){
hygl[j,i]<-p(j,i,k,hh,xfc,Y,X,v)
}
}
##print(hygl)
for( i in 1:k){
##计算新均值向量
##计算新协方差矩阵
fz<-rep(0,v)
fm<-0
for(j in 1:num){
fz<-fz+hygl[j,i]*X[,j]
fm<-fm+hygl[j,i]
}
Y[,i]<-fz/fm
fz1<-matrix(rep(0,v*v),nrow=v,ncol=v,byrow=TRUE)
fm1<-0
for(j in 1:num){
fz1<-fz1+hygl[j,i]*(X[,j]-Y[,i])%*%t(X[,j]-Y[,i])
fm1<-fm1+hygl[j,i]
}
xfc[,,i]<-fz1/fm1
##计算新混合系数
hh[i]<-fm1/num
}
cat(norm(oldY-Y))
if(norm(oldY-Y)<e)
break;
}
##print(Y)
##print(xfc)
##print(hh)
for(i in 1:num){
Y2[i]<-which.max(hygl[i,])
}
return (list(one=Y2,two=Y))
}
##簇数量
k<-3
##样本数量
num<-3000
##样本维度
v<-2
##构建样本集
x1<-rnorm(1000,mean=1,sd=0.5)
y1<-rnorm(1000,mean=1,sd=0.6)
x2<-rnorm(1000,mean=4,sd=0.7)
y2<-rnorm(1000,mean=3,sd=0.2)
x3<-rnorm(1000,mean=6,sd=0.8)
y3<-rnorm(1000,mean=1,sd=0.4)
X<-matrix(c(x1,x2,x3,y1,y2,y3),nrow=v,ncol=num,byrow=TRUE)
##均值向量
Y<-matrix(rep(0,v*k),nrow=v,ncol=k,byrow=TRUE)
##真实标记
Y1<-c(rep(1,1000),rep(2,1000),rep(3,1000))
##簇标记
Y2<-c(rep(0,num))
##真实分类
plot(X[1,],X[2,],main='1.3.1',
xlab='x1', ylab='x2',col=color(Y1,num))
##K-means聚类
result<-kmeans(num,k,v,Y,X,Y2,10^(-5))
Y2<-result$one
Y<-result$two
plot(X[1,],X[2,],main='1.3.2',
xlab='x1', ylab='x2',col=color(Y2,num))
for(i in 1:k){
points(Y[1,i],Y[2,i],col="black")
}
##GMM聚类
Y<-matrix(rep(0,v*k),nrow=v,ncol=k,byrow=TRUE)
Y2<-c(rep(0,num))
result<-gmm(num,v,k,X,Y,Y2,10^(-5))
Y2<-result$one
Y<-result$two
plot(X[1,],X[2,],main='1.3.3',
xlab='x1', ylab='x2',col=color(Y2,num))
for(i in 1:k){
points(Y[1,i],Y[2,i],col="black")
}
##UCI数据
k<-3
num<-150
v<-4
data<-read.table("iris.data",sep=",")
hy<-data[1:num,1:v]
dataY<-matrix(rep(0,v*k),nrow=v,ncol=k,byrow=TRUE)
dataY2<-c(rep(0,num))
dataY1<-cbind(data[,v+1])
dataX<-rbind(t(hy))
dataresult<-gmm(num,v,k,dataX,dataY,dataY2,10^(-7))
dataY2<-dataresult$one
dataY<-dataresult$two
scatterplot3d(dataX[1,],dataX[2,],dataX[3,],color=color(dataY2,num),main='1.3.4')
scatterplot3d(dataX[1,],dataX[2,],dataX[3,],color=clr(dataY1,num),main='1.3.5')
turn<-function(str){
if(str=="Iris-setosa"){
return (1)
}
else if(str=="Iris-virginica"){
return (2)
}
else{
return (3)
}
}
clr<-function(Y1,num){
colo<-rep("black",num)
cl<-c("green","red","blue","yellow")
for (i in 1:num){
colo[i]<-cl[turn(Y1[i])]
}
return (colo)
}