# 调用 sklearn.cluster.MeanShift 聚类算法的测试
#导入相关的模块
from sklearn.cluster import KMeans
from sklearn.externals import joblib
import numpy, time
import matplotlib.pyplot as plt
from sklearn.cluster import MeanShift, estimate_bandwidth
import numpy as np
# Default input file: one sample per line, "x y" separated by whitespace.
DATA_PATH = r'E:\项目数据\test_mean_shift.txt'

# A cluster is treated as significant only when it holds MORE than this
# many points (the original script tested `count > 3`).
MIN_CLUSTER_SIZE = 3

# Marker styles for sample points and for cluster centres. Entry i of both
# lists shares a colour/shape family, and both are indexed by cluster label
# modulo their length so any number of clusters can be drawn.
POINT_STYLES = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
CENTER_STYLES = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']


def load_points(path):
    """Read 2-D samples from a text file.

    Each non-blank line must start with two whitespace-separated numbers;
    any further fields on the line are ignored.

    Args:
        path: location of the text file.

    Returns:
        A list of ``[x, y]`` float pairs in file order.

    Raises:
        ValueError: if a non-blank line has fewer than two fields or a
            field is not a valid float.
        OSError: if the file cannot be opened.
    """
    points = []
    # `with` guarantees the handle is closed even if parsing fails.
    with open(path) as file_in:
        for line_no, line in enumerate(file_in, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            if len(fields) < 2:
                raise ValueError(
                    'line %d of %s: expected two numbers, got %r'
                    % (line_no, path, line.strip()))
            points.append([float(fields[0]), float(fields[1])])
    return points


def cluster_sizes(labels):
    """Count how many samples carry each cluster label.

    Args:
        labels: iterable of integer cluster labels. Negative labels are
            skipped.

    Returns:
        A list whose entry ``i`` is the number of samples labelled ``i``.
        Its length adapts to the largest label, so there is no fixed cap
        on the number of clusters.
    """
    label_array = np.asarray(labels, dtype=int)
    label_array = label_array[label_array >= 0]
    return np.bincount(label_array).tolist()


def significant_clusters(sizes, min_size=MIN_CLUSTER_SIZE):
    """Return the labels of clusters holding more than ``min_size`` samples.

    Args:
        sizes: per-label sample counts, as produced by ``cluster_sizes``.
        min_size: exclusive lower bound on the cluster size.

    Returns:
        A list of cluster labels in ascending order.
    """
    return [label for label, size in enumerate(sizes) if size > min_size]


def main(path=DATA_PATH):
    """Cluster the samples in ``path`` with MeanShift and plot the result.

    Prints the sample count, the label of every sample, the size of every
    cluster, the number of significant clusters and all cluster centres,
    then shows a scatter plot in which each cluster has its own marker and
    the centres of the significant clusters are drawn enlarged.

    Args:
        path: text file with one "x y" sample per line.
    """
    # Load the data.
    print('step1:load data')
    dataset = load_points(path)
    num_samples = len(dataset)
    print(num_samples)
    if not dataset:
        raise SystemExit('no samples found in %s' % path)
    X = np.array(dataset)

    # Kernel radius used when searching around each candidate centre.
    bandwidth = estimate_bandwidth(X, quantile=0.2, n_samples=500)
    clf = MeanShift(bandwidth=bandwidth, bin_seeding=True,
                    cluster_all=True).fit(X)
    labels = clf.labels_
    print(labels, type(labels))  # cluster membership of every sample

    # Determine which automatically discovered clusters are large enough.
    sizes = cluster_sizes(labels)
    print(sizes)
    kept = significant_clusters(sizes)
    print(len(kept))

    # Draw the samples one cluster at a time (a single plot call per
    # cluster instead of one per sample); same cluster, same marker.
    for label in np.unique(labels):
        members = X[labels == label]
        plt.plot(members[:, 0], members[:, 1],
                 POINT_STYLES[label % len(POINT_STYLES)])

    # Draw the centres of the significant clusters with larger markers.
    # They are addressed by label rather than "the first k rows", so a
    # small cluster sitting between large ones cannot shift the selection.
    centers = clf.cluster_centers_
    for label in kept:
        plt.plot(centers[label][0], centers[label][1],
                 CENTER_STYLES[label % len(CENTER_STYLES)], markersize=12)
    print(centers)  # coordinates of all cluster centres
    plt.show()


if __name__ == '__main__':
    main()
# 结果展示: