# K-means 聚类算法
# 案例:通过聚类,了解1999年各个省份的消费水平在国内的情况。
# 技术路线:sklearn.cluster.KMeans
# 学习参考来源:MOOC-Python机器学习应用-CH1聚类-KMeans算法。
# 附课程链接:https://www.icourse163.org/learn/BIT-1001872001?tid=1001965001#/learn/content?type=detail&id=1002854139&cid=1003246089
# 1、建立工程,导入sklearn相关包
import numpy as np
from sklearn.cluster import KMeans
# First, take a look at the raw data: every field is loaded as a string
# (the first column holds names, so a numeric dtype cannot be used here).
dataset = np.loadtxt(
    r"D:/2018_BigData/Python/Python_Book/3MOOC_MachineLearningInPython/Cluster/31city.txt",
    dtype=str,
    delimiter=",",
)
# Bare expression: an interactive/notebook session echoes the array here.
dataset
# 输出结果:
# array([['北京', '2959.19', '730.79', '749.41', '513.34', '467.87',
# '1141.82', '478.42', '457.64'],
# ['天津', '2459.77', '495.47', '697.33', '302.87', '284.19',
# '735.97', '570.84', '305.08'],
# ['河北', '1495.63', '515.90', '362.37', '285.32', '272.95',
# '540.58', '364.91', '188.63'],
# ['山西', '1406.33', '477.77', '290.15', '208.57', '201.50',
# '414.72', '281.84', '212.10'],
# ['内蒙古', '1303.97', '524.29', '254.83', '192.17', '249.81',
# '463.09', '287.87', '192.96'],
# ['辽宁', '1730.84', '553.90', '246.91', '279.81', '239.18',
# '445.20', '330.24', '163.86'],
# ['吉林', '1561.86', '492.42', '200.49', '218.36', '220.69',
# '459.62', '360.48', '147.76'],
# ['黑龙江', '1410.11', '510.71', '211.88', '277.11', '224.65',
# '376.82', '317.61', '152.85'],
# ['上海', '3712.31', '550.74', '893.37', '346.93', '527.00',
# '1034.98', '720.33', '462.03'],
# ['江苏', '2207.58', '449.37', '572.40', '211.92', '302.09',
# '585.23', '429.77', '252.54'],
# ['浙江', '2629.16', '557.32', '689.73', '435.69', '514.66',
# '795.87', '575.76', '323.36'],
# ['安徽', '1844.78', '430.29', '271.28', '126.33', '250.56',
# '513.18', '314.00', '151.39'],
# ['福建', '2709.46', '428.11', '334.12', '160.77', '405.14',
# '461.67', '535.13', '232.29'],
# ['江西', '1563.78', '303.65', '233.81', '107.90', '209.70',
# '393.99', '509.39', '160.12'],
# ['山东', '1675.75', '613.32', '550.71', '219.79', '272.59',
# '599.43', '371.62', '211.84'],
# ['河南', '1427.65', '431.79', '288.55', '208.14', '217.00',
# '337.76', '421.31', '165.32'],
# ['湖南', '1942.23', '512.27', '401.39', '206.06', '321.29',
# '697.22', '492.60', '226.45'],
# ['湖北', '1783.43', '511.88', '282.84', '201.01', '237.60',
# '617.74', '523.52', '182.52'],
# ['广东', '3055.17', '353.23', '564.56', '356.27', '811.88',
# '873.06', '1082.82', '420.81'],
# ['广西', '2033.87', '300.82', '338.65', '157.78', '329.06',
# '621.74', '587.02', '218.27'],
# ['海南', '2057.86', '186.44', '202.72', '171.79', '329.65',
# '477.17', '312.93', '279.19'],
# ['重庆', '2303.29', '589.99', '516.21', '236.55', '403.92',
# '730.05', '438.41', '225.80'],
# ['四川', '1974.28', '507.76', '344.79', '203.21', '240.24',
# '575.10', '430.36', '223.46'],
# ['贵州', '1673.82', '437.75', '461.61', '153.32', '254.66',
# '445.59', '346.11', '191.48'],
# ['云南', '2194.25', '537.01', '369.07', '249.54', '290.84',
# '561.91', '407.70', '330.95'],
# ['西藏', '2646.61', '839.70', '204.44', '209.11', '379.30',
# '371.04', '269.59', '389.33'],
# ['陕西', '1472.95', '390.89', '447.95', '259.51', '230.61',
# '490.90', '469.10', '191.34'],
# ['甘肃', '1525.57', '472.98', '328.90', '219.86', '206.65',
# '449.69', '249.66', '228.19'],
# ['青海', '1654.69', '437.77', '258.78', '303.00', '244.93',
# '479.53', '288.56', '236.51'],
# ['宁夏', '1375.46', '480.89', '273.84', '317.32', '251.08',
# '424.75', '228.73', '195.93'],
# ['新疆', '1608.82', '536.05', '432.46', '235.82', '250.28',
# '541.30', '344.85', '214.40']], dtype='<U7')
# 定义loadData函数
def loadData(filePath):
    """Read a comma-separated text file of labelled numeric rows.

    Each non-blank line is split on ``,``; the first field is kept as the
    row's name and every remaining field is converted with ``float``.

    Args:
        filePath: Path of the text file to read.

    Returns:
        A tuple ``(retData, retCityName)``. ``retData`` is a list with one
        list of floats per row; ``retCityName`` is the list of first-column
        names in the same order. Note the order: data first, names second.

    Raises:
        OSError: If the file cannot be opened for reading.
        ValueError: If a field after the first cannot be parsed as a float.
    """
    retData = []      # numeric columns of every row
    retCityName = []  # first-column name of every row
    # Read-only mode is enough (the file is never written), and the with-block
    # closes the handle even when parsing a line raises.
    with open(filePath, "r") as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of recording an empty name with an empty row.
                continue
            items = line.split(",")
            retCityName.append(items[0])
            retData.append([float(item) for item in items[1:]])
    return retData, retCityName
# 2、加载数据(loadData函数),创建K-means算法实例,并进行训练,获得标签:
if __name__ == '__main__':
    # Load the numeric spending rows and the matching names from the data file.
    data, cityName = loadData(r"D:/2018_BigData/Python/Python_Book/3MOOC_MachineLearningInPython/Cluster/31city.txt")

    # Single source of truth for the cluster count; the bucket list below is
    # sized from it so the two can never drift apart.
    n_clusters = 3
    km = KMeans(n_clusters=n_clusters)
    # fit_predict fits the model on the rows and yields one cluster label per row.
    label = km.fit_predict(data)
    # Sum the coordinates of each cluster centre: one total per cluster.
    expenses = np.sum(km.cluster_centers_, axis=1)

    # Group the names by the cluster label assigned to their row.
    CityCluster = [[] for _ in range(n_clusters)]
    for city, cluster_id in zip(cityName, label):
        CityCluster[cluster_id].append(city)

    # For every cluster, print its centre total followed by its members.
    for expense, cities in zip(expenses, CityCluster):
        print("Expenses:%.2f" % expense)
        print(cities)
# 输出结果:
# Expenses:3827.87
# ['河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '安徽', '江西', '山东', '河南', '湖北', '贵州', '陕西', '甘肃', '青海', '宁夏', '新疆']
# Expenses:7754.66
# ['北京', '上海', '广东']
# Expenses:5113.54
# ['天津', '江苏', '浙江', '福建', '湖南', '广西', '海南', '重庆', '四川', '云南', '西藏']
# 3、输出标签,查看结果
# 将城市按照消费水平聚成n_clusters类,消费水平相近的城市聚集在一类中
# expenses:聚类中心点的数值加和,也就是平均消费水平
# 聚成2类:km=KMeans(n_clusters=2)
if __name__ == '__main__':
    # Load the numeric spending rows and the matching names from the data file.
    data, cityName = loadData(r"D:/2018_BigData/Python/Python_Book/3MOOC_MachineLearningInPython/Cluster/31city.txt")

    # Single source of truth for the cluster count; the bucket list below is
    # sized from it so the two can never drift apart.
    n_clusters = 2
    km = KMeans(n_clusters=n_clusters)
    # fit_predict fits the model on the rows and yields one cluster label per row.
    label = km.fit_predict(data)
    # Sum the coordinates of each cluster centre: one total per cluster.
    expenses = np.sum(km.cluster_centers_, axis=1)

    # Group the names by the cluster label assigned to their row.
    CityCluster = [[] for _ in range(n_clusters)]
    for city, cluster_id in zip(cityName, label):
        CityCluster[cluster_id].append(city)

    # For every cluster, print its centre total followed by its members.
    for expense, cities in zip(expenses, CityCluster):
        print("Expenses:%.2f" % expense)
        print(cities)
# 输出结果:
# Expenses:4040.42
# ['河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '江苏', '安徽', '江西', '山东', '河南', '湖南', '湖北', '广西', '海南', '四川', '贵州', '云南', '陕西', '甘肃', '青海', '宁夏', '新疆']
# Expenses:6457.13
# ['北京', '天津', '上海', '浙江', '福建', '广东', '重庆', '西藏']
# 聚成4类:km=KMeans(n_clusters=4)
if __name__ == '__main__':
    # Load the numeric spending rows and the matching names from the data file.
    data, cityName = loadData(r"D:/2018_BigData/Python/Python_Book/3MOOC_MachineLearningInPython/Cluster/31city.txt")

    # Single source of truth for the cluster count; the bucket list below is
    # sized from it so the two can never drift apart.
    n_clusters = 4
    km = KMeans(n_clusters=n_clusters)
    # fit_predict fits the model on the rows and yields one cluster label per row.
    label = km.fit_predict(data)
    # Sum the coordinates of each cluster centre: one total per cluster.
    expenses = np.sum(km.cluster_centers_, axis=1)

    # Group the names by the cluster label assigned to their row.
    CityCluster = [[] for _ in range(n_clusters)]
    for city, cluster_id in zip(cityName, label):
        CityCluster[cluster_id].append(city)

    # For every cluster, print its centre total followed by its members.
    for expense, cities in zip(expenses, CityCluster):
        print("Expenses:%.2f" % expense)
        print(cities)
# 输出结果:
# Expenses:5678.62
# ['天津', '浙江', '福建', '重庆', '西藏']
# Expenses:3788.76
# ['河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '江西', '山东', '河南', '贵州', '陕西', '甘肃', '青海', '宁夏', '新疆']
# Expenses:7754.66
# ['北京', '上海', '广东']
# Expenses:4512.27
# ['江苏', '安徽', '湖南', '湖北', '广西', '海南', '四川', '云南']
# 从结果可以看出消费水平相近的省市聚集在了一类,例如消费最高的“北京”“上海”“广东”,聚集在了消费最高的类别。
# 聚4类时,结果可以比较明显地看出消费层级。