# -*-coding:utf-8
import numpy as np
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file to read. Every non-blank line is
            expected to hold tab-separated numeric fields.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats parsed from that line's fields.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager closes the file even when a conversion fails.
    with open(fileName) as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank line carries no sample; float('') would raise.
                continue
            # Build a real list: on Python 3, map() would return a lazy
            # iterator rather than the row of floats callers expect.
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Args:
        vecA: First vector (anything supporting ``vecA - vecB`` with NumPy
            element-wise semantics).
        vecB: Second vector, same shape as ``vecA``.

    Returns:
        The square root of the sum of squared element-wise differences.
    """
    delta = vecA - vecB
    squared_total = np.sum(np.power(delta, 2))
    return np.sqrt(squared_total)
def randCent(dataMat, k):
    """Build ``k`` random centroids inside the bounding box of the data.

    Args:
        dataMat: 2-D data set, one row per sample and one column per
            feature; it is sliced column-wise as ``dataMat[:, j]``.
        k: Number of centroids to generate.

    Returns:
        A ``k x n`` ``np.matrix`` (``n`` = number of columns of
        ``dataMat``). Column ``j`` holds values drawn with
        ``np.random.rand`` and scaled into the range spanned by feature
        ``j`` of the data.
    """
    n = np.shape(dataMat)[1]  # number of features
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        column = dataMat[:, j]
        minJ = np.min(column)
        rangeJ = float(np.max(column) - minJ)
        # Scale uniform samples from [0, 1) into this feature's range.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataMat, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of ``dataMat`` into ``k`` groups with k-means.

    The loop alternates between assigning every sample to its nearest
    centroid and moving each centroid to the mean of its members, and stops
    once a full pass changes no assignment.

    Args:
        dataMat: 2-D data matrix, one row per sample.
        k: Number of clusters.
        distMeas: Callable ``distMeas(centroid_row, sample_row)`` returning
            a distance.
        createCent: Callable ``createCent(dataMat, k)`` returning the
            initial ``k`` centroids.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is whatever
        ``createCent`` returned, updated in place. ``clusterAssment`` is an
        ``m x 2`` matrix whose first column is the assigned cluster index
        and whose second column is the squared distance to that centroid.
    """
    m = np.shape(dataMat)[0]
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    centroids = createCent(dataMat, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its closest centroid.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataMat[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                # Any changed assignment forces another iteration.
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            members = np.nonzero(clusterAssment[:, 0].A == cent)[0]
            if len(members) == 0:
                # The mean of an empty slice is NaN and would poison the
                # centroid permanently; keep its previous position instead.
                continue
            centroids[cent, :] = np.mean(dataMat[members], axis=0)
    return centroids, clusterAssment
def biKmeans(dataMat, k, distMeas=distEclud):
    """Cluster the rows of ``dataMat`` into ``k`` groups by bisecting k-means.

    Starts with a single cluster holding every sample, then repeatedly
    splits one cluster in two with ``kMeans(..., 2)``, each time choosing
    the split that gives the lowest total sum of squared errors (SSE).

    Args:
        dataMat: 2-D data matrix, one row per sample.
        k: Desired number of clusters.
        distMeas: Callable ``distMeas(vecA, vecB)`` returning a distance;
            it is used both for the initial errors and for every 2-means
            split.

    Returns:
        A tuple ``(centroids, clusterAssment)``: a matrix with one centroid
        per row, and an ``m x 2`` matrix holding each sample's cluster index
        and squared distance to its centroid.

    Raises:
        ValueError: If no non-empty cluster is left to split.
    """
    m = np.shape(dataMat)[0]
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # The mean of all samples is the centroid of the single initial cluster.
    centroid0 = np.mean(dataMat, axis=0).tolist()[0]
    centList = [centroid0]
    for j in range(m):
        clusterAssment[j, 1] = distMeas(np.asmatrix(centroid0), dataMat[j, :]) ** 2
    while len(centList) < k:
        lowestSSE = np.inf
        bestCentToSplit = None
        # Try splitting every current cluster and keep the best candidate.
        for i in range(len(centList)):
            ptsInCurrCluster = dataMat[np.nonzero(clusterAssment[:, 0].A == i)[0], :]
            if ptsInCurrCluster.shape[0] == 0:
                # An empty cluster cannot be split (the centroid
                # initialisation would fail on an empty slice).
                continue
            # Forward the caller's metric rather than a hard-coded one.
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas=distMeas)
            sseSplit = np.sum(splitClustAss[:, 1])  # error inside the split cluster
            sseNotSplit = np.sum(
                clusterAssment[np.nonzero(clusterAssment[:, 0].A != i)[0], 1]
            )  # error of all other clusters
            print("sseSplit, and notSplit:  %s %s" % (sseSplit, sseNotSplit))
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        if bestCentToSplit is None:
            raise ValueError("no non-empty cluster available to split")
        # Relabel the two halves: sub-cluster 1 gets a brand-new index,
        # sub-cluster 0 keeps the index of the cluster that was split.
        # Order matters: relabel 1 first so a new index of 1 is not
        # re-captured by the second mask.
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClustAss[np.nonzero(bestClustAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        print("the bestCentToSplit is:  %s" % (bestCentToSplit,))
        # Number of samples that belonged to the cluster just split.
        print("the len of bestClustAss is:  %s" % (len(bestClustAss),))
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])
        # Overwrite assignments and errors of the samples in the split cluster.
        clusterAssment[
            np.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :
        ] = bestClustAss
    return np.asmatrix(centList), clusterAssment
def distSLC(vecA, vecB):
    """Return the distance between two points on the Earth's surface.

    Uses the spherical law of cosines.

    Args:
        vecA: 2-D row vector; ``vecA[0, 0]`` is used as the longitude and
            ``vecA[0, 1]`` as the latitude, both in degrees.
        vecB: Second point, same layout as ``vecA``.

    Returns:
        The central angle between the points multiplied by 6371.0 (the
        Earth's mean radius in kilometres).
    """
    pi = np.pi
    latA = vecA[0, 1] * pi / 180
    latB = vecB[0, 1] * pi / 180
    deltaLon = (vecB[0, 0] - vecA[0, 0]) * pi / 180
    cosAngle = np.sin(latA) * np.sin(latB) + np.cos(latA) * np.cos(latB) * np.cos(deltaLon)
    # Rounding can push the sum marginally outside [-1, 1] for identical or
    # antipodal points, which would make arccos return NaN.
    cosAngle = np.clip(cosAngle, -1.0, 1.0)
    return np.arccos(cosAngle) * 6371.0
def clusterClubs(numClust=5):
    """Cluster the locations in ``places.txt`` and plot them over a map image.

    Reads ``places.txt`` (tab-separated; fields 4 and 3 of each line are
    parsed as floats and stored as the first and second coordinate),
    clusters the points with ``biKmeans`` using ``distSLC`` as the metric,
    and draws the clusters and their centroids on top of ``Portland.png``.

    Args:
        numClust: Number of clusters to build and draw.
    """
    import matplotlib.pyplot as plt

    datList = []
    # The context manager guarantees the data file is closed.
    with open('places.txt') as placesFile:
        for line in placesFile:
            lineArr = line.split('\t')
            datList.append([float(lineArr[4]), float(lineArr[3])])
    # np.asmatrix replaces the np.mat alias, which NumPy 2.0 removed.
    datMat = np.asmatrix(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust, distMeas=distSLC)

    fig = plt.figure()
    rect = [0.1, 0.1, 0.8, 0.8]
    scatterMarkers = ['s', 'o', '^', '8', 'p',
                      'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    # Background axes: the map image, without ticks.
    ax0 = fig.add_axes(rect, label='ax0', **axprops)
    # imshow needs pixel data, not a file name, so load the image first.
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    # Transparent foreground axes in the same rectangle for the scatter plots.
    ax1 = fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        ptsInCurrCluster = datMat[np.nonzero(clustAssing[:, 0].A == i)[0], :]
        # Cycle through the marker list when there are more clusters than markers.
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:, 0].flatten().A[0],
                    ptsInCurrCluster[:, 1].flatten().A[0],
                    marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:, 0].flatten().A[0],
                myCentroids[:, 1].flatten().A[0],
                marker='+', s=300)
    plt.show()
def plotpoint(dataMat, centroids, clustassing):
    """Scatter-plot the samples coloured by cluster, plus the centroids.

    Args:
        dataMat: 2-D data set; columns 0 and 1 of each row are plotted as
            x and y.
        centroids: 2-D collection of centroids; columns 0 and 1 of each row
            are plotted as x and y.
        clustassing: 2-D assignment table; column 0 of row ``i`` supplies
            the colour value of sample ``i``.
    """
    import matplotlib.pyplot as plt

    sample_rows = range(len(dataMat))
    sample_x = [dataMat[row, 0] for row in sample_rows]
    sample_y = [dataMat[row, 1] for row in sample_rows]
    labels = [clustassing[row, 0] for row in sample_rows]

    centroid_rows = range(len(centroids))
    centroid_x = [centroids[row, 0] for row in centroid_rows]
    centroid_y = [centroids[row, 1] for row in centroid_rows]

    figure = plt.figure()
    axes = figure.add_subplot(111)
    # Samples as circles coloured by cluster index, centroids as red stars.
    axes.scatter(sample_x, sample_y, marker='o', s=50, c=labels)
    axes.scatter(centroid_x, centroid_y, marker='*', s=100, c='red')
    plt.show()
# Kmeans
# (Blog-page footer, not code: "latest recommended article published on 2019-03-26 10:15:39")