1. Objectives
Understand the basic principle of the K-means clustering algorithm.
Learn to implement the K-means algorithm in Python/MATLAB.
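In brief, K-means alternates between two steps: assign each sample to its nearest centroid, then move each centroid to the mean of the samples assigned to it. Each pass locally reduces the within-cluster sum of squared distances J = Σ_{j=1..k} Σ_{x∈C_j} ‖x − μ_j‖², where μ_j is the centroid of cluster C_j, and the iteration stops once no assignment changes.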
2. Experiment Content
- Randomly generate 500 two-dimensional points and run K-means clustering on them (k = 3, 4, 5, 6), plotting the results with matplotlib (Python) or plot (MATLAB); a scikit-learn reference sketch is given after this list:
1) Randomly create a training set of 500 two-dimensional samples;
2) Cluster with k = 3 and observe the cluster distribution;
3) Cluster with k = 4 and observe the cluster distribution;
4) Cluster with k = 5 and observe the cluster distribution;
5) Cluster with k = 6 and observe the cluster distribution;
- From the five provided datasets (sales_data, consumption_data, bankloan, arima_data, iris), choose any 2-3 and perform a clustering analysis.
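As a quick cross-check of the hand-written implementation used in Section 3, the same random-data task can also be run with scikit-learn's built-in KMeans. This is only a reference sketch and assumes scikit-learn is installed; the experiment itself uses the implementation below.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as SKKMeans

rng = np.random.default_rng(0)
X = rng.integers(0, 1000, size=(500, 2))  # 500 random 2-D points, same range as in Section 3
plt.figure(figsize=(10, 8))
for idx, k in enumerate([3, 4, 5, 6], start=1):
    model = SKKMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    plt.subplot(2, 2, idx)
    plt.title('k=%d' % k)
    plt.scatter(X[:, 0], X[:, 1], c=model.labels_, alpha=0.5)  # samples coloured by cluster
    plt.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1],
                s=100, color='red', marker='x')  # centroids
plt.show()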
3. Experiment Code
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # registers the 3-D projection used for sale_data (needed on older Matplotlib)

# Font settings so that Chinese labels from the data render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
def distEclud(x, y):
    # Euclidean distance between two points
    return np.sqrt(np.sum(pow((x - y), 2)))

def randCent(dataSet, k):
    # Randomly pick k samples from the data set as the initial centroids
    m, n = dataSet.shape
    centroids = np.zeros((k, n))
    for i in range(k):
        index = int(np.random.uniform(0, m))
        centroids[i, :] = dataSet.values[index, :]
    return centroids
def KMeans(dataSet, k):
    m = dataSet.shape[0]  # number of samples
    # Column 0: index of the cluster each sample belongs to; column 1: distance to that cluster's centroid
    clusterAssment = np.mat(np.zeros((m, 2)))
    clusterChange = True
    centroids = randCent(dataSet, k)  # initialise the centroids
    while clusterChange:
        clusterChange = False
        for i in range(m):  # assign every sample to its nearest centroid
            minDist = float('inf')
            minIndex = -1
            for j in range(k):  # find the closest centroid
                distance = distEclud(centroids[j, :], dataSet.values[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:  # the assignment changed, so keep iterating
                clusterChange = True
                clusterAssment[i, :] = minIndex, minDist
        for j in range(k):  # after all samples are assigned, update each centroid
            pointsInCluster = dataSet.values[np.nonzero(np.array(clusterAssment[:, 0]) == j)[0]]  # all points in cluster j
            if len(pointsInCluster) > 0:  # guard against empty clusters
                centroids[j, :] = np.mean(pointsInCluster, axis=0)  # new centroid = column-wise mean of its points
    return centroids, clusterAssment
# Random 2-D data: 500 points with coordinates in [0, 1000)
x = np.random.randint(0, 1000, size=500)
y = np.random.randint(0, 1000, size=500)
randintnum = pd.concat([pd.DataFrame(x), pd.DataFrame(y)], axis=1, ignore_index=True)
print(randintnum)
# Cluster the random data with k = 3, 4, 5, 6 and plot each result
plt.figure(1)
for idx, k in enumerate([3, 4, 5, 6], start=1):
    plt.subplot(2, 2, idx)
    plt.title('k=%d' % k)
    randintnum_test, randintnum_result = KMeans(randintnum, k)
    randintnum_result = np.array(randintnum_result)
    plt.scatter(randintnum_test[:, 0], randintnum_test[:, 1], s=100, color='red', marker='x')
    plt.scatter(randintnum.values[:, 0], randintnum.values[:, 1], c=randintnum_result[:, 0], alpha=0.5)
plt.show()
# iris.txt, k = 3 (the last column is the class label and is removed before clustering)
data = pd.DataFrame(np.genfromtxt("iris.txt", dtype=[float, float, float, float, float]))
group = data.values[:, -1]
data = data.drop(data.columns[-1], axis=1)
print(data)
iris_cent, iris_result = KMeans(data, 3)
print(iris_cent)
iris_result = np.array(iris_result)
plt.figure(1)
ax = plt.subplot(1, 4, 1)
plt.title('Features 1 & 2, predicted')
ax.scatter(iris_cent[:, 0], iris_cent[:, 1], s=100, color='red', marker='x')
ax.scatter(data.values[:, 0], data.values[:, 1], c=iris_result[:, 0], alpha=0.5)
ax = plt.subplot(1, 4, 2)
plt.title('Features 1 & 2, true')
ax.scatter(iris_cent[:, 0], iris_cent[:, 1], s=100, color='red', marker='x')
ax.scatter(data.values[:, 0], data.values[:, 1], c=group, alpha=0.5)
ax = plt.subplot(1, 4, 3)
plt.title('Features 2 & 3, predicted')
ax.scatter(iris_cent[:, 1], iris_cent[:, 2], s=100, color='red', marker='x')
ax.scatter(data.values[:, 1], data.values[:, 2], c=iris_result[:, 0], alpha=0.5)
ax = plt.subplot(1, 4, 4)
plt.title('Features 2 & 3, true')
ax.scatter(iris_cent[:, 1], iris_cent[:, 2], s=100, color='red', marker='x')
ax.scatter(data.values[:, 1], data.values[:, 2], c=group, alpha=0.5)
plt.show()
# bankloan.xls, k = 2 (the '违约' column is the true label and is removed before clustering)
data = pd.DataFrame(pd.read_excel('bankloan.xls', header=0))
group = data.values[:, -1]
data = data.drop(['违约'], axis=1)
print(data)
bankloan_cent, bankloan_result = KMeans(data, 2)
bankloan_result = np.array(bankloan_result)
print(bankloan_cent)
print(bankloan_result)
plt.figure(1)
ax = plt.subplot(2, 2, 1)
plt.title('Debt ratio vs. credit card debt, predicted')
ax.scatter(bankloan_cent[:, 5], bankloan_cent[:, 6], s=100, color='red', marker='x')
ax.scatter(data.values[:, 5], data.values[:, 6], s=20, c=bankloan_result[:, 0], alpha=0.5)
ax = plt.subplot(2, 2, 2)
plt.title('Debt ratio vs. credit card debt, true')
ax.scatter(bankloan_cent[:, 5], bankloan_cent[:, 6], s=100, color='red', marker='x')
ax.scatter(data.values[:, 5], data.values[:, 6], s=20, c=group, alpha=0.5)
ax = plt.subplot(2, 2, 3)
plt.title('Credit card debt vs. other debt, predicted')
ax.scatter(bankloan_cent[:, 6], bankloan_cent[:, 7], s=100, color='red', marker='x')
ax.scatter(data.values[:, 6], data.values[:, 7], s=20, c=bankloan_result[:, 0], alpha=0.5)
ax = plt.subplot(2, 2, 4)
plt.title('Credit card debt vs. other debt, true')
ax.scatter(bankloan_cent[:, 6], bankloan_cent[:, 7], s=100, color='red', marker='x')
ax.scatter(data.values[:, 6], data.values[:, 7], s=20, c=group, alpha=0.5)
plt.show()
# sale_data.xls, k = 2 ('销量' is the true label; categorical columns are encoded as integer codes)
olderr = np.seterr(all='ignore')  # suppress floating-point warnings
data = pd.DataFrame(pd.read_excel('sale_data.xls', header=0))
data["销量"] = pd.Categorical(data["销量"]).codes
group = data.values[:, -1]
print(group)
data = data.drop(["序号"], axis=1)
data = data.drop(["销量"], axis=1)
data["天气"] = pd.Categorical(data["天气"]).codes
data["是否周末"] = pd.Categorical(data["是否周末"]).codes
data["是否有促销"] = pd.Categorical(data["是否有促销"]).codes
print(data)
sale_data_cent, sale_data_result = KMeans(data, 2)
sale_data_result = np.array(sale_data_result)
print(sale_data_result)
plt.figure(1)
ax = plt.subplot(1, 2, 1, projection='3d')
ax.scatter(sale_data_cent[:, 0], sale_data_cent[:, 1], sale_data_cent[:, 2], s=100, color='red', marker='x')
ax.scatter(data.values[:, 0], data.values[:, 1], data.values[:, 2], c=sale_data_result[:, 0], alpha=0.5)
ax = plt.subplot(1, 2, 2, projection='3d')
ax.scatter(sale_data_cent[:, 0], sale_data_cent[:, 1], sale_data_cent[:, 2], s=100, color='red', marker='x')
ax.scatter(data.values[:, 0], data.values[:, 1], data.values[:, 2], c=group, alpha=0.5)
plt.show()
# consumption_data.xls, k = 3 (R, F, M features; the Id column is dropped)
data = pd.DataFrame(pd.read_excel('consumption_data.xls', header=0))
data = data.drop(["Id"], axis=1)
print(data)
consumption_data_cent, consumption_data_result = KMeans(data, 3)
print(consumption_data_cent)
consumption_data_result = np.array(consumption_data_result)
print(consumption_data_result)
plt.figure(1)
ax = plt.subplot(1, 3, 1)
plt.title('R&F')
ax.scatter(consumption_data_cent[:, 0], consumption_data_cent[:, 1], s=100, color='red', marker='x')
ax.scatter(data.values[:, 0], data.values[:, 1], c=consumption_data_result[:, 0], alpha=0.5)
ax = plt.subplot(1, 3, 2)
plt.title('R&M')
ax.scatter(consumption_data_cent[:, 0], consumption_data_cent[:, 2], s=100, color='red', marker='x')
ax.scatter(data.values[:, 0], data.values[:, 2], c=consumption_data_result[:, 0], alpha=0.5)
ax = plt.subplot(1, 3, 3)
plt.title('F&M')
ax.scatter(consumption_data_cent[:, 1], consumption_data_cent[:, 2], s=100, color='red', marker='x')
ax.scatter(data.values[:, 1], data.values[:, 2], c=consumption_data_result[:, 0], alpha=0.5)
plt.show()
# arima_data.xls, k = 3 (the 日期 column is encoded as integer codes)
data = pd.DataFrame(pd.read_excel('arima_data.xls', header=0))
data["日期"] = pd.Categorical(data["日期"]).codes
print(data)
arima_data_cent, arima_data_result = KMeans(data, 3)
arima_data_result = np.array(arima_data_result)
print(arima_data_cent)
print(arima_data_result)
ax = plt.subplot()
ax.scatter(arima_data_cent[:, 0], arima_data_cent[:, 1], s=100, color='red', marker='x')
ax.scatter(data.values[:, 0], data.values[:, 1], c=arima_data_result[:, 0], alpha=0.5)
plt.show()
4. Results and Analysis
500 random points clustered with K-means (k = 3, 4, 5, 6):
iris.txt:
bankloan.xls:
sale_data:
consumption_data:
arima_data:
Judging from the results on these datasets, K-means is best suited to data in which objects within the same cluster are highly similar and the clusters are clearly separated from one another. Too many or too few features degrades its performance, and it is also quite sensitive to noise and outliers.
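One practical way to judge a suitable k is the elbow method. The rough sketch below reuses the KMeans function and the random data randintnum from Section 3 (it assumes that script has already been run): it plots the total within-cluster squared distance against k and looks for the point where the curve flattens.

sse = []
ks = range(2, 10)
for k in ks:
    _, assment = KMeans(randintnum, k)
    sse.append(np.sum(np.array(assment)[:, 1] ** 2))  # column 1 holds each sample's distance to its centroid
plt.plot(list(ks), sse, marker='o')
plt.xlabel('k')
plt.ylabel('SSE (within-cluster sum of squared distances)')
plt.show()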
Dataset
Link: https://pan.baidu.com/s/164xnXirXaD3gJHEctL5fSA
Extraction code: zxcv