#----------------------------------------------工具类代码-------------------------
import os
import numpy as np
import string
import matplotlib.pyplot as pl
'''
-author-: kenny adelaide
time: 2018/3/28
company: 西华师范大学2期理科楼B313实验室
description: cluster 算法实现
classfiy : cluster-ulti 工具包
'''
# 链式编写法则
class cluster:
'''
#这是一个工具类
'''
def __init__(self):
pass
# 初始化cluster k 均值算法的相关参数
def initParam(self,params):
if len(params)>0:
self.params=params
else:
print('请初始化cluster 算法的相关参数....')
return
self.params=params
self.u={}
self.path=self.params['path'] # 数据源文件的路径
self.flag=self.params['flag'] # 数据源文件的操作方式 r : read, w: write
self.CiNumber=int(self.params['CiNumber']) # 这是一个将整个数据集划分的一个标量
self.u[0]= self.params['u1']
self.u[1] = self.params['u2']
self.u[2] = self.params['u3']
self.assemblys={0:[],1:[],2:[]};
#print(self.u)
return self
# read data from data.txt
@property
def read_datafromfile(self):
f = open(self.path, self.flag)
lines = f.readlines()
lines = [lines[i].lstrip().strip().split(' ') for i in np.arange(len(lines))]
self.lines=lines
#print(lines)
return self
# 计算样本xj 与每个均值向量u 之间的距离
def distanceU(self,line,u):
distance=0.0
m={0:[],1:[],2:[]}
for i in np.arange(len(u)):
distance1= (float(u[i][0]) - float(line[0]))*(float(u[i][0]) - float(line[0]))
distance2 = (float(u[i][1]) - float(line[1])) * (float(u[i][1]) - float(line[1]))
distance = np.abs(np.sqrt(distance1+distance2)) ;
m[i]= float(distance)
return np.min(m) # 返回最小的距离
# 计算新的均值向量
def updateU(self):
for i in np.arange(3):
sum1 = 1;
sum2 = 1;
p=2.000
lens=0
if len(self.assemblys[i])>=0:
lens=len(self.assemblys[i])
p= float(1/(lens+1)) # 计算Ci 的长度
for j in np.arange(len(self.assemblys[i])):
sum1= sum1+ float(self.assemblys[i][j][0])
sum2= sum2+ float(self.assemblys[i][j][1])
# 更新当前的均值向量u的赋值
sum1=sum1*p
sum2=sum2*p
self.u[i][0]= float(str(sum1)[0:5])
self.u[i][1] = float(str(sum2)[0:5])
return self
# 分类器函数
@property
def culculate(self):
for j in np.arange(1000):
for i in np.arange(len(self.lines)):
# 计算样本xj 与每个均值向量u 之间的距离
mindistance=self.distanceU(self.lines[i],self.u)
#print(mindistance)
result=((mindistance[0] if mindistance[0] < mindistance[1] else mindistance[1] )
if (mindistance[0] if mindistance[0] < mindistance[1] else mindistance[1] )
<mindistance[2] else mindistance[2])
#print(result)
for k in np.arange(len(mindistance)):
if mindistance[k]==result:
self.assemblys[k].append(self.lines[i])
#根据距离进的均值向量确定xj 的簇标记
# 将样本xj 划入相应的簇
for i in np.arange(self.CiNumber):
# 计算新的均值向量
# 跟新当前新的均值向量
# 判断当前的均值向量, 如果均值向量没有改变,则保持当前的均值向量,停止迭代
u=self.u
self.updateU()
if u==self.u:
#print('第j 次 停止:')
break
print('新的均值向量:')
print(self.u)
print('0 类:')
print(self.assemblys[0])
print('1 类:')
print(self.assemblys[1])
print('2 类:')
print(self.assemblys[2])
self.assemblys = {0: [], 1: [], 2: []};
print('\n')
return self
#------------------------------------------------调用代码
import os
import numpy as np
import string
import ulti
import matplotlib.pyplot as plt
import math
# 随机选取三个样本作为初始值均向量
# 读取数据源的相关参数
params = { 'path': 'data.txt',
'flag': 'r',
'CiNumber': 3,
'u1': [0.203, 0.337],
'u2': [0.143, 0.199],
'u3': [0.378, 0.473]
};
ulti.cluster().initParam(params).read_datafromfile.culculate
#-----------------------------------------------------
迭代 n-1 ,n-2 次的结果
新的均值向量:
{0: [0.437, 0.323], 1: [0.692, 0.278], 2: [0.8, 0.598]}
0 类:
[['0.245', '0.057'], ['0.343', '0.099'], ['0.360', '0.370'], ['0.359', '0.188'], ['0.339', '0.241'], ['0.282', '0.257'], ['0.483', '0.312'], ['0.478', '0.437'], ['0.525', '0.369'], ['0.532', '0.472'], ['0.473', '0.376'], ['0.446', '0.459'], ['0.403', '0.237'], ['0.481', '0.149'], ['0.437', '0.211'], ['0.243', '0.267']]
1 类:
[['0.639', '0.161'], ['0.657', '0.198'], ['0.593', '0.042'], ['0.719', '0.103'], ['0.748', '0.232'], ['0.714', '0.346'], ['0.774', '0.376'], ['0.634', '0.264'], ['0.608', '0.318'], ['0.556', '0.215'], ['0.666', '0.091']]
2 类:
[['0.751', '0.489'], ['0.752', '0.445'], ['0.697', '0.460']]
新的均值向量:
{0: [0.437, 0.323], 1: [0.692, 0.278], 2: [0.8, 0.598]}
0 类:
[['0.245', '0.057'], ['0.343', '0.099'], ['0.360', '0.370'], ['0.359', '0.188'], ['0.339', '0.241'], ['0.282', '0.257'], ['0.483', '0.312'], ['0.478', '0.437'], ['0.525', '0.369'], ['0.532', '0.472'], ['0.473', '0.376'], ['0.446', '0.459'], ['0.403', '0.237'], ['0.481', '0.149'], ['0.437', '0.211'], ['0.243', '0.267']]
1 类:
[['0.639', '0.161'], ['0.657', '0.198'], ['0.593', '0.042'], ['0.719', '0.103'], ['0.748', '0.232'], ['0.714', '0.346'], ['0.774', '0.376'], ['0.634', '0.264'], ['0.608', '0.318'], ['0.556', '0.215'], ['0.666', '0.091']]
2 类:
[['0.751', '0.489'], ['0.752', '0.445'], ['0.697', '0.460']]
明显可以看到:均值向量没有更新