Sklearn库中Kmeans聚类技术
### 数据介绍:
现有1999年全国31个省份城镇居民家庭平均每人全年消费性支出的八个主要变量数据,这八个变量分别是:食品、衣着、家庭设备用品及服务、医疗保健、交通和通讯、娱乐教育文化服务、居住以及杂项商品和服务。利用已有数据,对31个省份进行聚类。
### 聚类目的:
通过聚类了解1999年各省份的消费水平在国内的情况。
### 实现过程:
1、导入相关模块
import numpy as np
from sklearn.cluster import KMeans
2、加载数据并清洗
def loadData(filePath, encoding=None):
    """Load the comma-separated data file into values and labels.

    Every non-blank line is expected to look like ``label,v1,v2,...``: the
    first field is stored as the label and each remaining field is parsed
    as a float.

    Args:
        filePath: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(retData, retCityName)`` tuple. ``retData`` is a list with one
        list of floats per data line; ``retCityName`` holds the matching
        labels in the same order.

    Raises:
        ValueError: If a value field cannot be converted to ``float``.
    """
    retData = []        # numeric rows, one per data line
    retCityName = []    # label (first field) of each data line
    # Read-only mode is sufficient, and the ``with`` block closes the file
    # even when parsing a line fails.
    with open(filePath, 'r', encoding=encoding) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # A blank line would otherwise produce an empty label and an
                # empty row, making the data ragged.
                continue
            items = line.split(",")
            retCityName.append(items[0])
            retData.append([float(value) for value in items[1:]])
    return retData, retCityName
3、主函数编写
if __name__ == '__main__':
    # Single source of truth for the cluster count, so the model and the
    # per-cluster buckets below cannot drift apart.
    n_clusters = 3
    data, cityName = loadData('city.txt')  # call the loadData function defined above
    km = KMeans(n_clusters=n_clusters)
    print(km)  # show the estimator and its parameters
    # fit_predict() computes the cluster centers and returns the cluster
    # index assigned to each sample.
    label = km.fit_predict(data)
    # Sum every center row-wise (axis=1) to get one expense figure per cluster.
    expenses = np.sum(km.cluster_centers_, axis=1)
    print(expenses)
    # Group the labels by the cluster index they were assigned, then print
    # each cluster's expense figure followed by its members.
    CityCluster = [[] for _ in range(n_clusters)]
    for cluster_id, name in zip(label, cityName):
        CityCluster[cluster_id].append(name)
    for total, members in zip(expenses, CityCluster):
        print("Expenses:%.2f" % total)
        print(members)
### 结果如下:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
[ 3827.86588235 7754.65666667 5113.54 ]
Expenses:3827.87
['河北', '山西', '内蒙古', '辽宁', '吉林', '黑龙江', '安徽', '江西', '山东', '河南', '湖北', '贵州', '陕西', '甘肃', '青海', '宁夏', '新疆']
Expenses:7754.66
['北京', '上海', '广东']
Expenses:5113.54
['天津', '江苏', '浙江', '福建', '湖南', '广西', '海南', '重庆', '四川', '云南', '西藏']
可以看出,KMeans将31个省份按消费水平分为了3类。
### 完整代码如下:
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 11 10:04:42 2017
"""
import numpy as np
from sklearn.cluster import KMeans
def loadData(filePath, encoding=None):
    """Load the comma-separated data file into values and labels.

    Every non-blank line is expected to look like ``label,v1,v2,...``: the
    first field is stored as the label and each remaining field is parsed
    as a float.

    Args:
        filePath: Path of the text file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A ``(retData, retCityName)`` tuple. ``retData`` is a list with one
        list of floats per data line; ``retCityName`` holds the matching
        labels in the same order.

    Raises:
        ValueError: If a value field cannot be converted to ``float``.
    """
    retData = []        # numeric rows, one per data line
    retCityName = []    # label (first field) of each data line
    # Read-only mode is sufficient, and the ``with`` block closes the file
    # even when parsing a line fails.
    with open(filePath, 'r', encoding=encoding) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # A blank line would otherwise produce an empty label and an
                # empty row, making the data ragged.
                continue
            items = line.split(",")
            retCityName.append(items[0])
            retData.append([float(value) for value in items[1:]])
    return retData, retCityName
if __name__ == '__main__':
    # Single source of truth for the cluster count, so the model and the
    # per-cluster buckets below cannot drift apart.
    n_clusters = 3
    data, cityName = loadData('city.txt')
    km = KMeans(n_clusters=n_clusters)
    print(km)
    # fit_predict() computes the cluster centers and returns the cluster
    # index assigned to each sample.
    label = km.fit_predict(data)
    # Sum every center row-wise (axis=1) to get one expense figure per cluster.
    expenses = np.sum(km.cluster_centers_, axis=1)
    print(expenses)
    # Group the labels by the cluster index they were assigned, then print
    # each cluster's expense figure followed by its members.
    CityCluster = [[] for _ in range(n_clusters)]
    for cluster_id, name in zip(label, cityName):
        CityCluster[cluster_id].append(name)
    for total, members in zip(expenses, CityCluster):
        print("Expenses:%.2f" % total)
        print(members)