# Python: K-means implemented by hand with NumPy
import numpy as np
import matplotlib.pylab as plt
from math import sqrt
from sklearn import datasets
class Kmeans():
    """A simple K-means clustering implementation built on NumPy.

    Attributes:
        data: 2-D array of samples, one sample per row.
        k: number of clusters to form.
    """

    # Matplotlib format strings used by ptlshow(); indexed modulo their
    # length so that more than ten clusters do not raise IndexError.
    _POINT_MARKS = ('or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr')
    _CENTROID_MARKS = ('Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb')

    def __init__(self, data, k):
        """Store the samples and the requested number of clusters.

        Args:
            data: 2-D array-like of shape (samples, features).
            k: number of clusters.
        """
        # asarray lets callers pass nested lists as well as ndarrays.
        self.data = np.asarray(data)
        self.k = k

    def distEclud(self, v1, v2):
        """Return the Euclidean distance between two vectors as a float."""
        diff = np.asarray(v1) - np.asarray(v2)
        return float(np.sqrt(np.sum(diff ** 2)))

    def randCent(self):
        """Pick k rows of the data at random as the initial centroids.

        Rows are drawn without replacement whenever there are at least k
        rows, so the same row is never used for two centroids.

        Returns:
            Float array of shape (k, features).
        """
        m = np.shape(self.data)[0]
        # Fall back to sampling with replacement only when k exceeds the
        # number of rows, where distinct picks are impossible.
        indices = np.random.choice(m, size=self.k, replace=self.k > m)
        return self.data[indices, :].astype(float)

    def km(self):
        """Run K-means until no sample changes cluster.

        Returns:
            A tuple ``(centroids, clusterAssment)`` where ``centroids`` has
            shape (k, features) and ``clusterAssment`` has one row per
            sample holding (cluster index, squared distance to centroid).
        """
        m = np.shape(self.data)[0]
        cluster_assment = np.zeros((m, 2))
        # -1 marks "not assigned yet", so the first pass records every
        # sample, including those whose nearest centroid is index 0.
        cluster_assment[:, 0] = -1
        centroids = self.randCent()
        cluster_changed = True
        while cluster_changed:
            cluster_changed = False
            # Assignment step: attach each sample to its nearest centroid.
            for i in range(m):
                min_dist = np.inf
                min_index = -1
                for j in range(self.k):
                    distance = self.distEclud(centroids[j, :], self.data[i, :])
                    if distance < min_dist:
                        min_dist = distance
                        min_index = j
                if cluster_assment[i, 0] != min_index:
                    cluster_changed = True
                # Always refresh the distance: centroids move between passes
                # even when the sample stays in the same cluster.
                cluster_assment[i, :] = min_index, min_dist ** 2
            # Update step: move each centroid to the mean of its members.
            for j in range(self.k):
                members = self.data[cluster_assment[:, 0] == j, :]
                # An empty cluster keeps its previous centroid; averaging
                # zero rows would turn it into NaN permanently.
                if len(members):
                    centroids[j, :] = np.mean(members, axis=0)
        return centroids, cluster_assment

    def ptlshow(self, k):
        """Plot the first two features before and after clustering.

        Shows the raw scatter plot, runs km(), then shows the samples
        drawn with one marker style per cluster together with the
        centroids, and finally prints the assignment table, the sample
        count, the summed squared distance and its mean per sample.

        Args:
            k: number of centroids to draw (capped at the number computed).
        """
        m = np.shape(self.data)[0]
        plt.scatter(self.data[:, 0], self.data[:, 1], c="r")
        plt.show()  # before clustering
        centroids, cluster_assment = self.km()
        labels = cluster_assment[:, 0].astype(int)
        point_marks = self._POINT_MARKS
        centroid_marks = self._CENTROID_MARKS
        for j in range(self.k):
            members = self.data[labels == j]
            plt.plot(members[:, 0], members[:, 1],
                     point_marks[j % len(point_marks)])
        for i in range(min(k, len(centroids))):
            plt.plot(centroids[i, 0], centroids[i, 1],
                     centroid_marks[i % len(centroid_marks)])
        plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
        plt.show()
        sse = cluster_assment[:, 1].sum()
        print(cluster_assment, m, sse, sse / m)
# Demo: cluster the iris samples with the hand-written Kmeans class above.
data = datasets.load_iris()
# Other feature pairs that can be clustered the same way:
#   data["data"][:, :2]  -> sepal length, sepal width
#   data["data"][:, 2:]  -> petal length, petal width
data_3 = data["data"][:, (0, 2)]  # sepal length, petal length
k = Kmeans(data_3, 3)
# ptlshow() runs km() itself, so a separate km() call whose result is
# thrown away would only repeat the clustering.
k.ptlshow(3)
# Sklearn: the same clustering done with scikit-learn's KMeans
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
import pandas as pd
import numpy as np
class Kmeans():
    """Cluster a four-feature dataset with scikit-learn's KMeans and plot it.

    Attributes:
        data: DataFrame of the feature matrix with the columns
            Sepal_Length, Sepal_Width, Petal_Length and Petal_Width.
        y: single-column DataFrame ('Targets') holding the class of each row.
        colormap: colour name per class index, used for the reference plots.
    """

    def __init__(self, data):
        """Print the raw dataset and convert it to pandas structures.

        Args:
            data: mapping with the keys "data", "feature_names", "target"
                and "target_names" (the module passes the result of
                ``datasets.load_iris()``).
        """
        self.data = data
        self.print_data("数据:", self.data["data"])
        self.print_data("特征名字:", self.data["feature_names"])
        self.print_data("目标值:", self.data["target"])
        self.print_data("目标值得名字:", self.data["target_names"])
        # Replace the raw mapping with a DataFrame of named feature columns.
        self.data = pd.DataFrame(self.data["data"])
        self.data.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length',
                             'Petal_Width']
        # Keep the class labels in their own single-column DataFrame.
        self.y = pd.DataFrame(data["target"])
        self.y.columns = ['Targets']
        # Colour palette indexed by class label.
        self.colormap = np.array(["red", "lime", "black"])

    def print_data(self, str, data):
        """Print a label and a value followed by a separator line.

        The first parameter is named ``str`` (shadowing the builtin inside
        this method); the name is kept so keyword callers keep working.
        """
        print(str, data)
        print("=" * 85)

    def show_data(self, x, y, title):
        """Show a scatter plot of x against y coloured by the true class."""
        plt.figure(figsize=(14, 7))
        plt.scatter(x, y, c=self.colormap[self.y.Targets], s=40)
        plt.title(title)
        plt.show()

    @staticmethod
    def _align_cluster_labels(true_labels, cluster_labels):
        """Map every cluster id to the most frequent true label inside it."""
        true_labels = np.asarray(true_labels).ravel()
        cluster_labels = np.asarray(cluster_labels).ravel()
        aligned = np.empty_like(true_labels)
        for cluster in np.unique(cluster_labels):
            members = cluster_labels == cluster
            values, counts = np.unique(true_labels[members], return_counts=True)
            aligned[members] = values[np.argmax(counts)]
        return aligned

    def km(self):
        """Plot the labelled data, cluster it into three groups and report.

        Shows three reference plots coloured by true class, fits KMeans on
        Sepal_Length and Petal_Length, shows the resulting clusters and
        prints the accuracy of the clustering against the true classes.
        """
        # Reference plots: sepal length vs width, petal length vs width,
        # and sepal length vs petal length.
        self.show_data(self.data.Sepal_Length, self.data.Sepal_Width, title='Sepal')
        self.show_data(self.data.Petal_Length, self.data.Petal_Width, title='Petal')
        self.show_data(self.data.Sepal_Length, self.data.Petal_Length, title='Lenght')
        # Features to cluster on. `.loc` replaces `.ix`, which no longer
        # exists in pandas 1.0 and later.
        X = self.data.loc[:, ['Sepal_Length', 'Petal_Length']]
        estimator = KMeans(n_clusters=3)
        estimator.fit(X)
        label_pred = estimator.labels_
        plt.rc("font", family="STXihei", size=10)
        for cluster, color in enumerate(('red', 'yellow', 'blue')):
            members = self.data[label_pred == cluster]
            plt.scatter(members['Sepal_Length'], members['Petal_Length'], 50,
                        color=color, marker='+', linewidth=2, alpha=0.8)
        plt.xlabel('Sepal_Length')
        # The y axis shows Petal_Length, so label it accordingly.
        plt.ylabel('Petal_Length')
        plt.xlim(4, 10)
        plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
        plt.show()
        # Cluster ids are arbitrary, so translate each cluster to its
        # majority class before comparing with the true labels.
        aligned = self._align_cluster_labels(self.y.Targets, label_pred)
        print('the accuracy is:', sm.accuracy_score(self.y.Targets, aligned))
# Demo: run the scikit-learn based clustering on the iris dataset.
data=datasets.load_iris()  # load the dataset
# Constructing Kmeans prints the raw dataset sections.
km=Kmeans(data)
# Show the plots and print the accuracy.
km.km()