原理: 将数据集中的样本划分为若干个通常互不相交的子集,每个子集称为一个“簇”。通过这样的划分,每个簇对应于一些潜在的概念(类别)。
1. 基本问题
1)性能度量
对聚类结果评价好坏:簇内相似度高,簇间相似度低
2)距离计算
2. kmeans 聚类
1)找最优的K值
# Choose the number of clusters K by average silhouette width.
# cluster.stats() comes from the fpc package.
library(fpc)
library(ggplot2)

K <- 2:10       # candidate cluster counts: 2 through 10
n_rounds <- 30  # kmeans() starts from random centres, so each K is fitted
                # 30 times and the silhouette widths are averaged
n <- 100
g <- 6
set.seed(g)

# Simulated 2-D data: g groups of n %/% g points each, every group drawn
# around its own random mean. Stored as `sim_data` (not `iris`) so the
# built-in iris dataset is not masked for later sections.
per_group <- n %/% g
sim_data <- data.frame(
  x = unlist(lapply(seq_len(g), function(i) rnorm(per_group, runif(1) * i^2))),
  y = unlist(lapply(seq_len(g), function(i) rnorm(per_group, runif(1) * i^2)))
)

# The distance matrix does not depend on K or on the round: compute it once.
sim_dist <- dist(sim_data[, 1:2])

# Mean average-silhouette-width for every candidate K.
rst <- vapply(K, function(i) {
  print(paste("K=", i))
  mean(vapply(seq_len(n_rounds), function(r) {
    print(paste("Round", r))
    result <- kmeans(sim_data[, 1:2], i)
    stats <- cluster.stats(sim_dist, result$cluster)
    stats$avg.silwidth
  }, numeric(1)))
}, numeric(1))

dat <- data.frame(k = K, rst = rst)

# Best K is looked up in K itself, so the highlighted point and its label
# always agree (the label previously showed the index, i.e. K - 1).
best <- data.frame(k = K[which.max(rst)], rst = max(rst))

ggplot(data = dat, aes(x = k, y = rst)) +
  geom_line() +
  geom_point(data = best, size = 2, colour = "red") +
  geom_text(
    data = best,
    aes(label = paste("(", k, ",", round(rst, 2), ")")),
    nudge_y = 0.015
  )
2) kmeans聚类过程
# k-means on the built-in iris data, followed by a check of the
# sum-of-squares decomposition reported by kmeans().
# `datasets::iris` is referenced explicitly because a plain `iris` may have
# been overwritten by an earlier section of this script.
iris1 <- datasets::iris

# Fit 3 clusters on the four numeric measurement columns. The starting
# centres are random, so cluster labels can differ between runs.
km <- kmeans(iris1[, 1:4], 3)
fitted(km)                        # cluster centre assigned to each observation
table(iris1$Species, km$cluster)  # species vs. cluster cross-tabulation

# Assess the clustering.
# ss(): total sum of squares of x after centring each column on its mean.
ss <- function(x) sum(scale(x, scale = FALSE)^2)

# Cluster centres "fitted" to each observation, and the residuals.
fitted.x <- fitted(km)
head(fitted.x)
resid.x <- iris1[, 1:4] - fitted(km)

# The two columns should agree:
#   betweenss    - between-cluster sum of squares
#   tot.withinss - total within-cluster sum of squares
#   totss        - total sum of squares
cbind(
  km[c("betweenss", "tot.withinss", "totss")],
  c(ss(fitted.x), ss(resid.x), ss(iris1[, 1:4]))
)

stopifnot(
  all.equal(km$totss, ss(iris1[, 1:4])),
  all.equal(km$tot.withinss, ss(resid.x)),
  # these three are the same:
  all.equal(km$betweenss, ss(fitted.x)),
  all.equal(km$betweenss, km$totss - km$tot.withinss),
  # and hence also
  all.equal(ss(iris1[, 1:4]), ss(fitted.x) + ss(resid.x))
)

# Trivial one-cluster solution: its within-cluster SS equals ss(x).
kmeans(iris1[, 1:4], 1)$withinss

# Sepal length vs. width: colour = assigned cluster, symbol = true species.
plot(
  iris1$Sepal.Length, iris1$Sepal.Width,
  col = km$cluster,
  pch = as.integer(iris1$Species)
)
# Overlay the cluster centres on the same two axes, one colour per cluster.
points(
  km$centers[, c("Sepal.Length", "Sepal.Width"), drop = FALSE],
  col = seq_len(nrow(km$centers)),
  pch = 8,
  cex = 2
)