K-means和K-means++主要区别在于,K-means++算法选择初始类中心时,尽可能选择相距较远的类中心,而K-means仅仅是随机初始化类中心。
#K-means算法
from pylab import *
from numpy import *
import codecs
import matplotlib.pyplot as plt
# Load the tab-separated samples: each line of data.txt is "x<TAB>y<TAB>label".
data = []
labels = []
with codecs.open("data.txt", "r") as f:
    for row in f.readlines():
        x, y, label = row.strip().split('\t')
        data.append([float(x), float(y)])
        labels.append(float(label))
datas = array(data)
k = 3  # number of clusters
def distance(x1, x2):
    """Return the Euclidean (L2) distance between vectors x1 and x2."""
    diff = x1 - x2
    return sqrt(sum(diff * diff))
# Random initialisation of cluster centres (plain K-means seeding).
def randcenter(dataset, k):
    """Return a (k, dim) array of random initial cluster centres.

    Each centre coordinate is drawn uniformly from the [min, max] range of
    the corresponding column of ``dataset``, so centres always lie inside
    the data's bounding box.

    Fix: the parameter was previously named ``set``, shadowing the builtin;
    every call site in this file passes it positionally, so the rename is
    safe.
    """
    dim = shape(dataset)[1]
    init_cen = zeros((k, dim))
    for i in range(dim):
        # Use the array methods rather than min()/max() names leaked in by
        # star imports — unambiguous and equivalent.
        col_min = dataset[:, i].min()
        col_range = float(dataset[:, i].max() - col_min)
        # Scale k uniform samples from [0, 1) into [col_min, col_min + col_range).
        init_cen[:, i] = col_min + col_range * random.rand(k)
    return init_cen
# Main routine: standard K-means with random initial centres.
def Kmeans(dataset, k):
    """Cluster ``dataset`` (an (n, dim) array) into ``k`` groups.

    Returns ``(center, cluster_assign)`` where ``center`` is a (k, dim)
    array of centres and ``cluster_assign[i]`` holds
    ``(cluster index, squared distance to that centre)`` for sample i.
    Iterates assignment/update steps until no sample changes cluster.
    """
    row_m = shape(dataset)[0]
    cluster_assign = zeros((row_m, 2))
    # Bug fix: this variant is plain K-means, so it must use the random
    # initialiser defined above; ``get_centroids`` (K-means++ seeding) is
    # not defined in this program and would raise NameError.
    center = randcenter(dataset, k)
    change = True
    while change:
        change = False
        # Assignment step: attach every sample to its nearest centre.
        for i in range(row_m):
            mindist = inf
            min_index = -1
            for j in range(k):
                distance1 = distance(center[j, :], dataset[i, :])
                if distance1 < mindist:
                    mindist = distance1
                    min_index = j
            if cluster_assign[i, 0] != min_index:
                change = True
            cluster_assign[i, :] = min_index, mindist ** 2
        # Update step: move each centre to the mean of its members.
        for cen in range(k):
            cluster_data = dataset[nonzero(cluster_assign[:, 0] == cen)]
            # Guard: an empty cluster would make mean() return NaN and
            # poison the centre; keep the previous centre instead.
            if len(cluster_data) > 0:
                center[cen, :] = mean(cluster_data, 0)
    return center, cluster_assign
# Run K-means and visualise the clustering.
cluster_center, cluster_assign = Kmeans(datas, k)
print(cluster_center)
# Fix the axis ranges to the data domain.
xlim(0, 10)
ylim(0, 10)
# Scatter plot: one colour/marker per cluster.
f1 = plt.figure(1)
plt.scatter(datas[nonzero(cluster_assign[:,0]==0),0],datas[nonzero(cluster_assign[:,0]==0),1],marker='o',color='r',label='0',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==1),0],datas[nonzero(cluster_assign[:,0]==1),1],marker='+',color='b',label='1',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==2),0],datas[nonzero(cluster_assign[:,0]==2),1],marker='*',color='g',label='2',s=30)
# Bug fix: column 0 of the centres is x and column 1 is y (matching how the
# data was read); the original passed them swapped, drawing every centre at
# its transposed position.
plt.scatter(cluster_center[:, 0], cluster_center[:, 1], marker='x', color='m', s=50)
plt.show()
K-means运行结果:
类中心:
[[ 7.16504475 7.12121176]
[ 2.94805141 2.84547461]
[ 4.92859254 4.93144926]]
#K-means++
from pylab import *
from numpy import *
import codecs
import matplotlib.pyplot as plt
# Read the samples for the K-means++ run; each row is "x<TAB>y<TAB>label".
data = []
labels = []
with codecs.open("data.txt", "r") as f:
    for line in f.readlines():
        x, y, label = line.strip().split('\t')
        data.append([float(x), float(y)])
        labels.append(float(label))
datas = array(data)
def distance(x1, x2):
    """Euclidean distance between two numpy vectors."""
    return sqrt(power(x1 - x2, 2).sum())
# Distance from one sample to its closest already-chosen cluster centre.
def nearest(point, cluster_centers):
    """Return the distance from ``point`` to the nearest row of
    ``cluster_centers`` (the centres initialised so far)."""
    min_dist = inf
    # Consistency fix: the rest of this file uses the bare numpy names from
    # the star imports; ``np`` only happened to work because it leaks out of
    # ``from pylab import *``.
    m = shape(cluster_centers)[0]  # number of centres chosen so far
    for i in range(m):
        # Distance from point to the i-th centre.
        d = distance(point, cluster_centers[i, ])
        # Keep the smallest one.
        if min_dist > d:
            min_dist = d
    return min_dist
# K-means++ seeding: choose initial centres that tend to be far apart.
def get_centroids(dataset, k):
    """Pick ``k`` initial centres from ``dataset`` (shape (m, n)).

    The first centre is a uniformly random sample; each subsequent centre
    is drawn by roulette-wheel selection with probability proportional to
    its distance to the nearest centre already chosen.

    NOTE(review): canonical K-means++ weights by the *squared* distance
    D(x)^2; this implementation weights by the plain distance — confirm
    whether that is intentional.
    """
    # Consistency fix: use the bare numpy names (as the rest of the file
    # does) instead of relying on ``np`` leaking out of ``from pylab import *``.
    m, n = shape(dataset)
    cluster_centers = zeros((k, n))
    # 1. Pick the first centre uniformly at random.
    index = random.randint(0, m)
    cluster_centers[0, ] = dataset[index, ]
    # 2. d[j] will hold sample j's distance to its nearest chosen centre.
    d = [0.0 for _ in range(m)]
    for i in range(1, k):
        sum_all = 0
        for j in range(m):
            # 3. Refresh each sample's distance to the closest centre so far.
            d[j] = nearest(dataset[j, ], cluster_centers[0:i, ])
            # 4. Accumulate the total distance mass.
            sum_all += d[j]
        # 5. Draw a random threshold in [0, sum_all).
        sum_all *= random.rand()
        # 6. Roulette wheel: the sample at which the running total crosses
        #    zero becomes the next centre (far samples carry more mass).
        for j, di in enumerate(d):
            sum_all = sum_all - di
            if sum_all > 0:
                continue
            cluster_centers[i, ] = dataset[j, ]
            break
    return cluster_centers
# Main routine: K-means using K-means++ seeding for the initial centres.
def Kmeans(dataset, k):
    """Cluster ``dataset`` (an (n, dim) array) into ``k`` groups.

    Identical to plain K-means except that the initial centres come from
    the K-means++ seeding routine ``get_centroids``. Returns
    ``(center, cluster_assign)`` where ``cluster_assign[i]`` is
    ``(cluster index, squared distance to that centre)`` for sample i.
    """
    row_m = shape(dataset)[0]
    cluster_assign = zeros((row_m, 2))
    # K-means++ seeding: centres start far apart.
    center = get_centroids(dataset, k)
    change = True
    while change:
        change = False
        # Assignment step: attach every sample to its nearest centre.
        for i in range(row_m):
            mindist = inf
            min_index = -1
            for j in range(k):
                distance1 = distance(center[j, :], dataset[i, :])
                if distance1 < mindist:
                    mindist = distance1
                    min_index = j
            if cluster_assign[i, 0] != min_index:
                change = True
            cluster_assign[i, :] = min_index, mindist ** 2
        # Update step: move each centre to the mean of its members.
        for cen in range(k):
            cluster_data = dataset[nonzero(cluster_assign[:, 0] == cen)]
            # Guard: an empty cluster would make mean() return NaN and
            # poison the centre; keep the previous centre instead.
            if len(cluster_data) > 0:
                center[cen, :] = mean(cluster_data, 0)
    return center, cluster_assign
# Run K-means++ and visualise the clustering.
cluster_center, cluster_assign = Kmeans(datas, 3)
print(cluster_center)
# Fix the axis ranges to the data domain.
xlim(0, 10)
ylim(0, 10)
# Scatter plot: one colour/marker per cluster.
f1 = plt.figure(1)
plt.scatter(datas[nonzero(cluster_assign[:,0]==0),0],datas[nonzero(cluster_assign[:,0]==0),1],marker='o',color='r',label='0',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==1),0],datas[nonzero(cluster_assign[:,0]==1),1],marker='+',color='b',label='1',s=30)
plt.scatter(datas[nonzero(cluster_assign[:,0]==2),0],datas[nonzero(cluster_assign[:,0]==2),1],marker='*',color='g',label='2',s=30)
# Bug fix: column 0 of the centres is x and column 1 is y (matching how the
# data was read); the original passed them swapped, drawing every centre at
# its transposed position.
plt.scatter(cluster_center[:, 0], cluster_center[:, 1], marker='x', color='m', s=50)
plt.show()
K-means++程序运行结果:
类中心:
[[ 4.92859254 4.93144926]
[ 2.94805141 2.84547461]
[ 7.16504475 7.12121176]]