# -*-coding:utf-8-*-
# Data loading and k-means clustering
from numpy import *
from scipy.cluster.vq import kmeans
def loadDataSet(fileName):
    """Load a tab-delimited text file of numbers.

    Each non-blank line becomes one row: a list of floats.
    Returns a list of rows, e.g. [[x1, y1], [x2, y2], ...].
    """
    dataWord = []
    # 'with' guarantees the handle is closed (the original leaked it),
    # and iterating the file avoids reading everything into memory at once.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split("\t")
            if curLine == ['']:
                continue  # tolerate blank/trailing lines
            # A list comprehension materializes the floats; a bare map()
            # would be a one-shot iterator on Python 3.
            dataWord.append([float(tok) for tok in curLine])
    return dataWord
# 计算欧几里得距离
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two same-shaped vectors."""
    diff = vecA - vecB
    return sqrt(sum(diff ** 2))
#构建类簇的中心,随机取4个作为随机质心
def randCent(dataSet ,k):
    """Build k random initial centroids inside the bounding box of dataSet.

    dataSet: numpy matrix of shape (m, n).
    k: number of centroids to generate.
    Returns a (k, n) matrix; column j of every centroid is drawn uniformly
    from [min of column j, max of column j], so centroids always lie within
    the data's range.
    """
    n = shape(dataSet)[1]  # number of features (columns)
    # BUG FIX: original had zeros(k.n) — attribute access on the int k,
    # which raises AttributeError. zeros takes a shape tuple (k, n).
    centroids = mat(zeros((k, n)))
    for j in range(n):
        # Use the matrix methods: builtin min()/max() over a matrix
        # iterates its rows rather than its scalar values.
        minJ = dataSet[:, j].min()
        maxJ = dataSet[:, j].max()
        rangeJ = float(maxJ - minJ)
        # Fill column j of all k centroids with uniform draws in [minJ, maxJ].
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
#k-means 聚类算法
def KMeans(dataSet, k, distMeans=None, createCent=None):
    """Lloyd's k-means clustering.

    dataSet:    numpy matrix of shape (m, n), one sample per row.
    k:          number of clusters.
    distMeans:  distance function (defaults to distEclud).
    createCent: initial-centroid factory (defaults to randCent).

    Returns (centroids, clusterAssment): centroids is a (k, n) matrix;
    clusterAssment is an (m, 2) matrix whose row i holds
    [assigned cluster index, squared distance to that centroid].
    """
    # Late-bind the sibling defaults so they are resolved at call time,
    # not captured when the function object is created.
    if distMeans is None:
        distMeans = distEclud
    if createCent is None:
        createCent = randCent
    m = shape(dataSet)[0]  # number of samples
    clusterAssment = mat(zeros((m, 2)))  # col 0: cluster id, col 1: dist^2
    centroids = createCent(dataSet, k)
    clusterChanged = True
    # Iterate until no sample switches cluster (convergence).
    while clusterChanged:
        clusterChanged = False
        # Assignment step: give each sample to its nearest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeans(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True  # at least one sample moved
            clusterAssment[i, :] = minIndex, minDist ** 2
        # FIXED: original used Python-2 `print centroids` (syntax error on py3).
        print(centroids)
        # Update step: move each centroid to the mean of its members.
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            # Guard: the mean of an empty cluster would be NaN and poison
            # every later iteration; leave an empty cluster's centroid as-is.
            if len(ptsInClust):
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def _main():
    """Driver: load the sample data, cluster it with the local KMeans,
    and show the resulting centroids and assignments.

    FIXED: the original used Python-2 print statements, clustered twice
    (first with scipy's kmeans), and then passed scipy's results — a
    centroid matrix and an assignment matrix — as the *callable*
    distMeans/createCent parameters of KMeans, a guaranteed TypeError.
    """
    print("step 1: load data...")
    dataSet = mat(loadDataSet('D:/testSet.txt'))
    print("step 2: clustering...")
    k = 4
    centroids, clusterAssment = KMeans(dataSet, k)
    print("step 3: show the result...")
    print(centroids)
    print(clusterAssment)

if __name__ == "__main__":
    # Guard the entry point so importing this module has no side effects.
    _main()
# Machine learning: a Python implementation of the k-means algorithm
# (blog-scrape residue; originally posted 2021-08-16 15:50:25)