If the code link at the end of this post no longer works, please leave your email in the comments or send me a private message.
Clustering Concepts
Unsupervised problem: the data at hand has no labels.
Clustering: group similar things together.
Difficulties: how to evaluate the result and how to tune the parameters.
The K-Means Algorithm
Basic Concepts
1. The number of clusters must be chosen in advance, i.e. the value of K must be specified.
2. Centroid: the mean, i.e. the average of each dimension over the samples in a cluster.
3. Distance metric: to judge whether two sample points are similar, look at their Euclidean distance or cosine similarity. When using Euclidean distance, standardize or normalize the data first so that every feature varies within a comparably small range (see the numeric sketch after this list).
4. Optimization objective: min\sum_{i=1}^{k}\sum_{x\in c_i}dist(c_i,x)^2, i.e. the sample points in each cluster should be as close as possible to their cluster center.
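As a small numeric sketch of points 3 and 4 (the toy samples, feature scales, and cluster assignment below are purely illustrative), we can standardize the features, compute the Euclidean distance and cosine similarity between two samples, and evaluate the objective for a given assignment:
import numpy as np
# Two features on very different scales (illustrative toy data)
X = np.array([[1.0, 2000.0],
              [2.0, 1500.0],
              [8.0,  100.0]])
# Standardize each feature to zero mean and unit variance before using Euclidean distance
X_std = (X - X.mean(axis=0)) / X.std(axis=0)
a, b = X_std[0], X_std[1]
euclidean = np.sqrt(np.sum((a - b) ** 2))
cosine_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
# Objective: sum over clusters of squared distances from each member to its centroid
labels = np.array([0, 0, 1])  # a hypothetical assignment of the three samples
objective = 0.0
for c in np.unique(labels):
    members = X_std[labels == c]
    centroid = members.mean(axis=0)
    objective += np.sum((members - centroid) ** 2)
print(euclidean, cosine_sim, objective)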
Workflow
The six panels referenced above illustrate the process. The task is to split the data in the first panel into two clusters. Step one: place the centroids at random positions, as in the second panel. Then compute the distance from every sample point to each random centroid and assign each point to its nearest centroid, splitting the samples into two groups, as in the third panel. Next recompute the centroids as the per-dimension mean of the points in each group, as in the fourth panel; compute the distances from the sample points to the new centroids and reassign the points, as in the fifth panel; then recompute the centroids once more. Repeat until the assignments no longer change. A minimal NumPy sketch of this loop follows.
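As a minimal sketch of the loop just described (the synthetic two-blob data and k = 2 are illustrative assumptions, not part of the original figures):
import numpy as np
np.random.seed(0)
# Synthetic 2-D data: two blobs (illustrative only)
data = np.vstack([np.random.randn(50, 2), np.random.randn(50, 2) + 5])
k = 2
# Step 1: pick k random samples as the initial centroids
centroids = data[np.random.permutation(len(data))[:k]]
for _ in range(10):
    # Assign every point to its nearest centroid (squared Euclidean distance)
    distances = ((data[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
    labels = distances.argmin(axis=1)
    # Recompute each centroid as the mean of the points assigned to it
    new_centroids = centroids.copy()
    for j in range(k):
        if np.any(labels == j):  # guard against an empty cluster
            new_centroids[j] = data[labels == j].mean(axis=0)
    if np.allclose(new_centroids, centroids):  # stop once the centroids no longer move
        break
    centroids = new_centroids
print(centroids)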
Pros and Cons
The value of K (the number of clusters) is hard to determine; in practice you usually try several values of K and compare the results (see the elbow-method sketch below).
The complexity is linear in the number of samples.
It struggles to discover clusters of arbitrary (non-spherical) shape.
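One common way to compare several K values is the elbow method: run K-means for a range of K and plot the total within-cluster sum of squares. A sketch using scikit-learn's KMeans for brevity (the synthetic three-blob data is illustrative; the class defined below could be used instead by summing squared distances to the returned centroids):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
np.random.seed(0)
# Synthetic data with three blobs (illustrative only)
data = np.vstack([np.random.randn(50, 2) + offset for offset in (0, 5, 10)])
ks = range(1, 8)
inertias = []
for k in ks:
    model = KMeans(n_clusters=k, n_init=10, random_state=0).fit(data)
    inertias.append(model.inertia_)  # total within-cluster sum of squares
plt.plot(list(ks), inertias, marker='o')
plt.xlabel('K')
plt.ylabel('within-cluster sum of squares')
plt.title('elbow method')
plt.show()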
Code Walkthrough
The first step is to pick the initial cluster centroids at random.
import numpy as np

class KMeans:
    def __init__(self, data, num_clustres):
        self.data = data
        self.num_clustres = num_clustres

    def train(self, max_iterations):
        # 1. Randomly pick k centroids
        centroids = KMeans.centroids_init(self.data, self.num_clustres)
        # 2. Start training
        num_examples = self.data.shape[0]  # data.shape[0] is the number of rows (samples)
        closest_centroids_ids = np.empty((num_examples, 1))
        for _ in range(max_iterations):
            closest_centroids_ids = KMeans.centroids_find_closest(self.data, centroids)

    @staticmethod
    def centroids_init(data, num_clustres):
        # Shuffle the sample indices and take the first num_clustres samples as initial centroids
        num_examples = data.shape[0]
        random_ids = np.random.permutation(num_examples)
        centroids = data[random_ids[:num_clustres], :]
        return centroids
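For intuition, the same initialization written as a few standalone lines on a toy array (the toy data is purely illustrative):
import numpy as np
np.random.seed(1)
toy_data = np.arange(20).reshape(10, 2)  # 10 samples, 2 features (illustrative)
# Shuffling the row indices and taking the first k gives k distinct samples as initial centroids
random_ids = np.random.permutation(toy_data.shape[0])
centroids = toy_data[random_ids[:3], :]
print(centroids.shape)  # (3, 2): three distinct samples chosen as centroids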
Next, assign each sample to its nearest centroid.
    @staticmethod
    def centroids_find_closest(data, centroids):
        num_examples = data.shape[0]        # number of sample points
        num_centroids = centroids.shape[0]  # number of centroids
        closest_centroids_ids = np.zeros((num_examples, 1))
        for example_index in range(num_examples):
            # Squared Euclidean distance from this sample to every centroid
            distance = np.zeros((num_centroids, 1))
            for centroid_index in range(num_centroids):
                distance_diff = data[example_index, :] - centroids[centroid_index, :]
                distance[centroid_index] = np.sum(distance_diff ** 2)
            closest_centroids_ids[example_index] = np.argmin(distance)
        return closest_centroids_ids
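The double loop above computes squared Euclidean distances one pair at a time. The same assignment can also be written with NumPy broadcasting in a few lines (a sketch, not part of the original class):
import numpy as np

def find_closest_vectorized(data, centroids):
    # (num_examples, 1, num_features) - (1, num_centroids, num_features)
    # broadcasts to a (num_examples, num_centroids, num_features) difference array
    diff = data[:, None, :] - centroids[None, :, :]
    distances = np.sum(diff ** 2, axis=2)  # squared distance to every centroid
    return np.argmin(distances, axis=1).reshape(-1, 1)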
Finally, update the centroids and keep iterating.
    @staticmethod
    def centroids_compute(data, closest_centroids_ids, num_clustres):
        num_features = data.shape[1]
        centroids = np.zeros((num_clustres, num_features))
        for centroid_id in range(num_clustres):
            # Boolean mask of the samples currently assigned to this centroid
            closest_ids = closest_centroids_ids == centroid_id
            # The new centroid is the per-dimension mean of those samples
            centroids[centroid_id] = np.mean(data[closest_ids.flatten(), :], axis=0)
        return centroids
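A tiny standalone check of this update step (the toy data and labels are illustrative assumptions):
import numpy as np
toy_data = np.array([[0.0, 0.0], [0.0, 2.0], [10.0, 10.0], [10.0, 12.0]])
labels = np.array([[0], [0], [1], [1]])  # first two points in cluster 0, last two in cluster 1
for cluster_id in range(2):
    mask = (labels == cluster_id).flatten()
    print(cluster_id, toy_data[mask].mean(axis=0))  # expected means: [0, 1] and [10, 11]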
The complete KMeans clustering class is defined as follows:
import numpy as np

class KMeans:
    def __init__(self, data, num_clustres):
        self.data = data
        self.num_clustres = num_clustres

    def train(self, max_iterations):
        # 1. Randomly pick k centroids
        centroids = KMeans.centroids_init(self.data, self.num_clustres)
        # 2. Start training
        num_examples = self.data.shape[0]  # data.shape[0] is the number of rows (samples)
        closest_centroids_ids = np.empty((num_examples, 1))
        for _ in range(max_iterations):
            # Assign every sample to its nearest centroid
            closest_centroids_ids = KMeans.centroids_find_closest(self.data, centroids)
            # Update the centroid positions
            centroids = KMeans.centroids_compute(self.data, closest_centroids_ids, self.num_clustres)
        return centroids, closest_centroids_ids

    @staticmethod
    def centroids_init(data, num_clustres):
        num_examples = data.shape[0]
        random_ids = np.random.permutation(num_examples)
        centroids = data[random_ids[:num_clustres], :]
        return centroids

    @staticmethod
    def centroids_find_closest(data, centroids):
        num_examples = data.shape[0]  # number of sample points
        num_centroids = centroids.shape[0]
        closest_centroids_ids = np.zeros((num_examples, 1))
        for example_index in range(num_examples):
            distance = np.zeros((num_centroids, 1))
            for centroid_index in range(num_centroids):
                distance_diff = data[example_index, :] - centroids[centroid_index, :]
                distance[centroid_index] = np.sum(distance_diff ** 2)
            closest_centroids_ids[example_index] = np.argmin(distance)
        return closest_centroids_ids

    @staticmethod
    def centroids_compute(data, closest_centroids_ids, num_clustres):
        num_features = data.shape[1]
        centroids = np.zeros((num_clustres, num_features))
        for centroid_id in range(num_clustres):
            closest_ids = closest_centroids_ids == centroid_id
            centroids[centroid_id] = np.mean(data[closest_ids.flatten(), :], axis=0)
        return centroids
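Assuming the class above is saved as k_means.py (the module name used in the iris example below), a quick smoke test on synthetic data might look like this (the two blobs are illustrative, and the result depends on the random initialization):
import numpy as np
from k_means import KMeans
np.random.seed(0)
# Two well-separated blobs (illustrative data only)
data = np.vstack([np.random.randn(100, 2), np.random.randn(100, 2) + 6])
k_means = KMeans(data, num_clustres=2)
centroids, closest_centroids_ids = k_means.train(max_iterations=50)
print(centroids)                          # should land near (0, 0) and (6, 6), in either order
print(closest_centroids_ids[:5].ravel())  # cluster id of the first five samples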
Clustering the Iris Dataset
Data source: http://download.tensorflow.org/data/iris_training.csv. The labels may need to be edited; if you would rather not do that yourself, use the link at the end of this post.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from k_means import KMeans

data = pd.read_csv('iris.csv')
iris_types = ['setosa', 'versicolor', 'virginica']

x_axis = 'petal_length'
y_axis = 'petal_width'

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
for iris_type in iris_types:
    plt.scatter(data[x_axis][data['class'] == iris_type], data[y_axis][data['class'] == iris_type], label=iris_type)
plt.title('label known')
plt.legend()
plt.subplot(1, 2, 2)
plt.scatter(data[x_axis][:], data[y_axis][:])
plt.title('label unknown')
plt.show()

num_examples = data.shape[0]
x_train = data[[x_axis, y_axis]].values.reshape(num_examples, 2)

# Training parameters
num_clusters = 3
max_iterations = 50

k_means = KMeans(x_train, num_clusters)
centroids, closest_centroids_ids = k_means.train(max_iterations)

# Compare the results
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
for iris_type in iris_types:
    plt.scatter(data[x_axis][data['class'] == iris_type], data[y_axis][data['class'] == iris_type], label=iris_type)
plt.title('label known')
plt.legend()
plt.subplot(1, 2, 2)
for centroid_id, centroid in enumerate(centroids):
    current_examples_index = (closest_centroids_ids == centroid_id).flatten()
    plt.scatter(data[x_axis][current_examples_index], data[y_axis][current_examples_index], label=centroid_id)
for centroid_id, centroid in enumerate(centroids):
    plt.scatter(centroid[0], centroid[1], c='black', marker='x')
plt.legend()
plt.title('k means')
plt.show()
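For comparison, the same clustering of the two petal features can also be done with scikit-learn's KMeans in a few lines (a sketch, assuming the same iris.csv file with petal_length and petal_width columns):
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans as SKKMeans
data = pd.read_csv('iris.csv')
x_train = data[['petal_length', 'petal_width']].values
model = SKKMeans(n_clusters=3, n_init=10, random_state=0).fit(x_train)
plt.scatter(x_train[:, 0], x_train[:, 1], c=model.labels_)
plt.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1], c='black', marker='x')
plt.title('k means (scikit-learn)')
plt.show()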
Link: https://pan.baidu.com/s/1aod20TFcWNs3_mNKq0ZngQ
Extraction code: p3pb