二分kmeans python实现

今天要对1000条记录(每条记录有n个属性)的文本数据进行聚类,采用的是二分k均值(bisecting k-means)方法。

算法思想:

我参考了Pang-Ning Tan《数据挖掘导论》第317页的内容。

相对于普通kmeans,它的优点是受初始质心的影响较小。

#coding utf-8
#python 3.4
#2015-4-3
#Fitz Yin
#yinruyi.hm@gmail.com
import numpy as np


def makedict(f):
    """Map each row number to the whitespace-split fields of that row.

    Args:
        f: Iterable of text lines (an open file or a list of strings).

    Returns:
        dict mapping the 0-based line index to the list of string fields
        produced by ``line.split()``.
    """
    return {row_no: line.split() for row_no, line in enumerate(f)}


def kmeans(data):
    """Split ``data`` into two clusters with scikit-learn's KMeans.

    Args:
        data: Sequence of rows; every value must be convertible to float.

    Returns:
        Tuple ``(zero_class, one_class, cohesion_0, cohesion_1)`` where the
        two lists hold the positions (within ``data``) of the rows labelled
        0 and 1, and each cohesion value is the sum of the cosine
        similarities between the cluster's rows and its centre.
    """
    # Imported here because this is the only function that needs
    # scikit-learn; the other helpers stay usable without it.
    from sklearn.cluster import KMeans

    # Rows read from a text file are lists of strings; convert explicitly
    # instead of relying on the estimator to coerce them.
    data = np.asarray(data, dtype=float)
    computer = KMeans(n_clusters=2)
    computer.fit(data)
    labels = computer.labels_

    one_class = [i for i, label in enumerate(labels) if label == 1]
    zero_class = [i for i, label in enumerate(labels) if label != 1]

    centers = computer.cluster_centers_
    # The accumulators start at -1, exactly as in the original code (whose
    # comment remarks that the cosine of a vector with itself is 1).
    cohesion_0 = -1
    cohesion_1 = -1
    for i in zero_class:
        cohesion_0 += judge_cos(data[i], centers[0])
    for i in one_class:
        cohesion_1 += judge_cos(data[i], centers[1])
    return zero_class, one_class, cohesion_0, cohesion_1


def judge_cos(x, y):
    """Return the cosine similarity of two equally long vectors.

    Values are passed through ``float()``, so numeric strings are accepted.
    If either vector has zero length (all components zero), ``'error'`` is
    printed and 0 is returned.
    """
    af = bf = ab = 0.0
    for x_i, y_i in zip(x, y):
        x_val = float(x_i)
        y_val = float(y_i)
        # Accumulate over every component; plain assignment here would
        # keep only the last component and make every result +/-1.
        af += x_val * x_val
        bf += y_val * y_val
        ab += x_val * y_val
    if af == 0 or bf == 0:
        print('error')
        return 0
    return ab / (np.sqrt(af) * np.sqrt(bf))


def gettransdict(split_set, split_number):
    """Map positions inside the chosen cluster to original row numbers.

    Args:
        split_set: List of ``[row_numbers, cohesion]`` entries.
        split_number: Index of the entry about to be split.

    Returns:
        dict mapping the position of a row inside the matrix handed to
        ``kmeans`` to that row's number in the full data set.
    """
    return dict(enumerate(split_set[split_number][0]))


def getsplitset(split_set, split_number):
    """Return a new list with the cluster at ``split_number`` left out."""
    return [
        cluster
        for index, cluster in enumerate(split_set)
        if index != split_number
    ]


def getsplitnumber(split_set):
    """Return the index of the cluster to split next.

    The entry with the largest cohesion score (second element of each
    entry) is chosen; ties go to the first such entry. An empty
    ``split_set`` yields 0.
    """
    return max(
        range(len(split_set)),
        key=lambda index: split_set[index][1],
        default=0,
    )


def main(path='train.txt', k=3):
    """Run bisecting k-means on a whitespace-separated text file.

    Args:
        path: File to read; one record per line, fields split on whitespace.
        k: Number of clusters to produce.

    Prints the final cluster list in the form
    ``[[row_numbers, cohesion], ...]``.

    Raises:
        ValueError: If the cluster selected for splitting has fewer than
            two rows, so it cannot be bisected.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data_dict = makedict(f)

    # Start with one cluster holding every row that was actually read,
    # instead of assuming the file has exactly 1000 lines.
    split_set = [[list(range(len(data_dict))), 0]]
    split_number = 0  # index of the cluster to bisect next

    # "<" rather than "!=" so that k < 1 cannot loop forever.
    while len(split_set) < k:
        rows = split_set[split_number][0]
        if len(rows) < 2:
            raise ValueError(
                'cannot split a cluster with fewer than two rows'
            )
        transdict = gettransdict(split_set, split_number)
        array2kmeans = [data_dict[i] for i in rows]
        zero_class, one_class, cohesion_0, cohesion_1 = kmeans(array2kmeans)

        # Translate positions inside the sub-matrix back to row numbers.
        real_zero_class = [transdict[i] for i in zero_class]
        real_one_class = [transdict[i] for i in one_class]

        # Replace the cluster that was split with its two halves.
        split_set = getsplitset(split_set, split_number)
        split_set.append([real_zero_class, cohesion_0])
        split_set.append([real_one_class, cohesion_1])
        split_number = getsplitnumber(split_set)

    print(split_set)


if __name__ == '__main__':
    main()

 

转载于:https://www.cnblogs.com/yinruyi/p/4390918.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值