# Listing 10-1: k-means clustering support functions
import matplotlib
from time import sleep
import urllib
import matplotlib.pyplot as plt
import json
from numpy import *
def loadDataSet(fileName):
    """Load a tab-delimited text file of numbers into a list of rows.

    Args:
        fileName: Path of the text file to read. Every non-blank line is
            stripped, split on tab characters, and each field is converted
            with ``float()``.

    Returns:
        A list containing one inner list of floats per non-blank line, in
        file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the handle even when a conversion fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) carry no data and
                # would otherwise fail in float('').
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector; any array-like (list, ndarray, or matrix row).
        vecB: Second vector, broadcast-compatible with ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences,
        as a NumPy floating-point scalar.
    """
    # asarray lets plain Python sequences be subtracted element-wise; the
    # float dtype avoids wrap-around when unsigned integers are subtracted.
    diff = asarray(vecA, dtype=float) - asarray(vecB, dtype=float)
    # The .sum() method reduces over every element no matter which `sum`
    # name (builtin or numpy) is visible in the module namespace.
    return sqrt(power(diff, 2).sum())
def randCent(dataSet, k):
    """Build k random centroids inside the bounding box of the data.

    Args:
        dataSet: 2-D data with one sample per row. It is passed through
            ``asmatrix``, so a matrix, an ndarray, or a nested list works.
        k: Number of centroids to generate.

    Returns:
        A ``(k, n)`` matrix, where n is the number of columns of
        ``dataSet``. Column j holds values ``min_j + range_j * r`` with
        ``r`` drawn by ``random.rand``, so every value lies between the
        minimum and maximum of data column j.
    """
    dataSet = asmatrix(dataSet)
    n = shape(dataSet)[1]
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    centroids = asmatrix(zeros((k, n)))
    # One rand() call per column keeps the draw order identical for a
    # given random seed.
    for j in range(n):
        column = dataSet[:, j]
        # The array methods return plain scalars, whether or not the names
        # min/max in this module refer to the builtins or to numpy.
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
# Interactive checks (run one statement at a time)
# datMat = mat(kMeans.loadDataSet('testSet.txt'))
# datMat
# min(datMat[:, 0])
# min(datMat[:, 1])
# max(datMat[:, 0])
# max(datMat[:, 1])
# kMeans.randCent(datMat, 2) # two random centroids
# kMeans.distEclud(datMat[0], datMat[1]) # Euclidean distance
# Listing 10-3: bisecting k-means clustering algorithm
def biKmeans(dataSet, k, distMeas=distEclud):
    """Cluster the data with the bisecting k-means algorithm.

    Starts with one cluster holding every point. On each round every
    non-empty cluster is tentatively split in two with ``kMeans``; the split
    that gives the lowest total SSE (SSE of the two halves plus SSE of all
    other points) is kept. This repeats until there are ``k`` centroids.

    Args:
        dataSet: 2-D data with one sample per row; passed through
            ``asmatrix``.
        k: Desired number of clusters.
        distMeas: Callable ``distMeas(vecA, vecB)`` returning a distance.
            It is used for the initial errors and forwarded to ``kMeans``.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is a matrix
        with one row per centroid. ``clusterAssment`` is an ``(m, 2)``
        matrix whose column 0 is the cluster index of each point and whose
        column 1 is that point's error value (squared distance for the
        initial cluster; afterwards whatever ``kMeans`` reports —
        presumably also squared error, confirm against ``kMeans``).
        Fewer than ``k`` centroids are returned if no further split could
        be selected.
    """
    dataSet = asmatrix(dataSet)
    m = shape(dataSet)[0]
    # asmatrix is used instead of the `mat` alias, which NumPy 2.0 removed.
    clusterAssment = asmatrix(zeros((m, 2)))
    # The first centroid is the mean of the whole data set.
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(
            asmatrix(centroid0), dataSet[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = inf
        # Reset per round so a stale split from an earlier round can never
        # be applied again.
        bestCentToSplit = None
        bestNewCents = None
        bestClustAss = None
        for i in range(len(centList)):
            # Points currently assigned to cluster i.
            ptsInCurrCluster = dataSet[
                nonzero(clusterAssment[:, 0].A == i)[0], :]
            if shape(ptsInCurrCluster)[0] == 0:
                # An empty cluster has nothing to split.
                continue
            centroidMat, splitClustAss = kMeans(
                ptsInCurrCluster, 2, distMeas)
            # Error of this candidate split plus the error of every point
            # outside cluster i is the total error for this choice.
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(clusterAssment[nonzero(
                clusterAssment[:, 0].A != i)[0], 1])
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            # No candidate compared below infinity (e.g. every total was
            # NaN or every cluster was empty); stop instead of failing on
            # unbound names.
            break
        # kMeans with two clusters labels its results 0 and 1. Relabel 1
        # first to the new cluster index, then 0 to the index of the
        # cluster being split; this order keeps the two filters from
        # interfering when bestCentToSplit is 1.
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(
            centList)
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0], 0] = \
            bestCentToSplit
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # Replace the split centroid with the first new one and append the
        # second new one.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Overwrite the assignments and errors of the points that belonged
        # to the split cluster.
        clusterAssment[nonzero(
            clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClustAss
    return asmatrix(centList), clusterAssment
# Example usage
# import kMeans
# datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))
# centList,myNewAssments = kMeans.biKmeans(datMat3,3)
# centList