一、层次聚类
Step 1、缩放数据
data(nutrient, package="flexclust")
df1 <- apply(nutrient, 2, function(x){(x-mean(x))/sd(x)})
df2 <- apply(nutrient, 2, function(x){x/max(x)})
df3 <- apply(nutrient, 2, function(x){(x-mean(x))/mad(x)})
# MAD(X)=median(abs(Xi−median(X)))
df4 <- scale(nutrient) # 将其标准化为均值为0、方差为1
Step 2、计算欧几里得距离
d <- dist(nutrient)
as.matrix(d)[1:5,1:5]
Step 3、选择聚类的个数
library(NbClust)
devAskNewPage(ask=TRUE)
nc <- NbClust(df1, distance="euclidean",
min.nc=2, max.nc=15, method="average")
# min.nc最少聚类数,max.nc=15最多聚类数
table(nc$Best.n[1,]) # 查看最优聚类数
barplot(table(nc$Best.n[1,]),
xlab="Numer of Clusters", ylab="Number of Criteria",
main="Number of Clusters Chosen by 26 Criteria")
Step 4、获取最终的聚类方案
fit.average <- hclust(d, method="average") # 层次聚类
clusters <- cutree(fit.average, k=3) # k=3是上面计算出最优聚类个数是3
table(clusters) # 展示每个聚类下有几个样本
aggregate(df1, by=list(cluster=clusters), median) #描述聚类,每个聚类下各变量的中位数
plot(fit.average, hang=-1, cex=.8, main="Average Linkage Clustering\n3 Cluster Solution") # 树状图
rect.hclust(fit.average, k=3) # 框线指示聚类划分
二、K 均值聚类
data(wine, package="rattle")
# 缩放数据
df1 <- apply(wine[-1], 2, function(x){(x-mean(x))/sd(x)})
# 确定聚类个数
library(NbClust)
devAskNewPage(ask=TRUE)
rc <- NbClust(df1,min.nc=2, max.nc=13, method="kmeans")
# min.nc最少聚类数,max.nc=15最多聚类数
table(rc$Best.n[1,]) # 查看最优聚类数
barplot(table(rc$Best.n[1,]),
xlab="Numer of Clusters", ylab="Number of Criteria",
main="Number of Clusters Chosen by 26 Criteria")
# K均值聚类分析
set.seed(1234)
fit.km <-kmeans(df1, 3, nstart=5) # K均值聚类
# 描述性统计
fit.km$size # 展示每个聚类下有几个样本
fit.km$centers #描述聚类,每个聚类下各变量的均值
aggregate(df1, by=list(cluster=fit.km$cluster), mean) # 功能与上一句相同
aggregate(wine[-1], by=list(cluster=fit.km$cluster), mean) # 观察原始数据的聚类划分情况
a<-table(wine$Type, fit.km$cluster) # 查看每种酒在聚类下的分布
兰德指数(Rand index)来量化类型变量和类之间的协议
library(flexclust)
randIndex(a)
三、围绕质心的划分(PAM)
data(wine, package="rattle")
# 缩放数据
df1 <- apply(wine[-1], 2, function(x){(x-mean(x))/sd(x)})
# 确定聚类个数
library(NbClust)
devAskNewPage(ask=TRUE)
rc <- NbClust(df1,min.nc=2, max.nc=13, method="kmeans")
# min.nc最少聚类数,max.nc=15最多聚类数
table(rc$Best.n[1,]) # 查看最优聚类数
barplot(table(rc$Best.n[1,]),
xlab="Numer of Clusters", ylab="Number of Criteria",
main="Number of Clusters Chosen by 26 Criteria")
library(cluster)
set.seed(1234)
fit.pam <- pam(wine[-1], k=3, stand=TRUE) # 用PAM方法确定聚类划分
fit.pam$medoids # 查看质心
clusplot(fit.pam, main="Bivariate Cluster Plot") # 图示聚类的划分
ct.pam <- table(wine$Type, fit.pam$clustering) # 查看每种酒在聚类下的分布
library(flexclust)
randIndex(a)
四、避免不存在的类
plot(nc$All.index[,4], type="o", ylab="CCC",
xlab="Number of clusters", col="blue")