#################################################
# kmeans: k-means cluster
# Email : jtailong@163.com
#################################################
from sklearn.cluster import KMeans
import numpy
import matplotlib.pyplot as plt
# step 1: load data
print('step 1: load data...')
# Read the comma-separated samples from test.txt into dataSet as one flat
# list of numbers (three values are taken from every input line).
dataSet = []
# The context manager closes the file even if a line fails to parse.
with open('D:/P/test.txt') as fileIn:
    for line in fileIn:
        lineArr = line.strip().split(',')
        if lineArr == ['']:
            # Blank line (for example a trailing newline): nothing to parse.
            continue
        # Keep the two-decimal rounding of the input, but store real floats
        # instead of formatted text so later numeric code receives numbers.
        # Indexing columns 0..2 explicitly still raises IndexError on a line
        # with fewer than three fields.
        dataSet.extend(round(float(lineArr[col]), 2) for col in range(3))
# step 2: clustering...
print('step 2: clustering...')
# Cluster the samples with the KMeans class from sklearn.cluster.
print(dataSet)
# Build a float matrix with three features per row. The row count is inferred
# (-1) instead of being hard-coded, so any number of samples is accepted, and
# dtype=float also parses values that were stored as numeric strings.
dataSet = numpy.array(dataSet, dtype=float).reshape(-1, 3)
# n_clusters is the number of clusters to form; random_state=0 is passed so
# the seed is fixed.
kmeans = KMeans(n_clusters=3, random_state=0).fit(dataSet)
# Get the cluster centers found by the fitted model.
center = kmeans.cluster_centers_
print(center)
# Only the first two coordinates of each center are collected for plotting;
# the third coordinate is not used. The values are kept as floats (rounded to
# six decimals) rather than formatted strings, because matplotlib treats
# string coordinates as category labels instead of numeric positions.
center_x = [round(float(row[0]), 6) for row in center]
center_y = [round(float(row[1]), 6) for row in center]
# Cluster label assigned to each sample by the fitted model.
labels = kmeans.labels_
type1_x, type1_y = [], []
type2_x, type2_y = [], []
type3_x, type3_y = [], []
# Map every cluster label to the pair of lists that collects its points.
# Only the first two columns of a sample are kept.
groups = {
    0: (type1_x, type1_y),
    1: (type2_x, type2_y),
    2: (type3_x, type3_y),
}
for i, label in enumerate(labels):
    if label in groups:
        xs, ys = groups[label]
        xs.append(dataSet[i][0])
        ys.append(dataSet[i][1])
print(labels)
# Plot the three clusters of data points and the cluster centers.
plt.figure(figsize=(8, 6), dpi=80)  # figure size and resolution
axes = plt.subplot(111)
# One entry per scatter series: x values, y values, marker color.
series = [
    (type1_x, type1_y, 'red'),
    (type2_x, type2_y, 'green'),
    (type3_x, type3_y, 'pink'),
    (center_x, center_y, 'blue'),
]
# Coerce every coordinate sequence to a float array before plotting, so that
# values held as numeric strings are drawn at their numeric positions instead
# of being treated as category labels.
type1, type2, type3, type_center = [
    axes.scatter(
        numpy.asarray(xs, dtype=float),
        numpy.asarray(ys, dtype=float),
        s=40,
        c=color,
    )
    for xs, ys, color in series
]
plt.xlabel('x')
plt.ylabel('y')
axes.legend((type1, type2, type3, type_center), ('0', '1', '2', 'center'), loc=1)
plt.show()
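# Note: this script works once the data file has been converted to the
# expected format: one sample per line, written as three comma-separated
# numbers.
# To prepare the input, open the .data file as plain text, save its contents
# as a .txt file, and then run the script to produce the cluster plot.
# Open question: how can 100 samples be taken from each cluster? For the
# mathematical background of the clustering, see:
# http://www.cnblogs.com/jerrylead/archive/2011/04/06/2006910.html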