聚类分析:K-MEANS算法

如果文末代码链接失效请将邮箱留言或者私信

聚类概念

无监督问题:手里的数据没有标签
聚类:相似的东西分到一组
难点:如何评估,如何调参

K-MEANS算法

基本概念

1.需要得到簇的个数,需要指定K值
2.质心:均值,即向量各维度取平均值
3.距离的度量:如果要判断两个样本点是否相似,就看它们的欧氏距离或者余弦相似度。在使用欧氏距离时,需要先对数据进行标准化,归一化,让x和y的取值范围在比较小的范围内浮动。
4.优化目标:$\min\sum_{i=1}^{k}\sum_{x\in c_i}dist(c_i,x)^2$,即簇当中的样本点离聚类中心越近越好。

工作流程

在这里插入图片描述
观察上面六幅图片:任务是要把第一幅图分为两类,第一步是在图中任意位置随机选择质心,如第二幅图,开始计算其他点与随机质心的距离,根据这些样本点到质心的距离对样本进行分类,然后将其分为两类,如第三幅图,然后重新计算质心(根据分类后的两类样本点各维度上的平均值),根据重新计算得到质心,如第四幅图,开始计算样本点到新质心的距离,然后重新分类,如第五幅图。然后再一次重新计算质心,如此反复,直到分类结果不再发生改变。

优缺点

K值(类别数)难以确定,一般设置多组K值看效果
复杂度和样本数呈线性关系
很难发现任意形状的簇如:
在这里插入图片描述

代码介绍

首先第一步为随机得到聚类中心点

import numpy as np
class KMeans:
    """First step of the K-Means walkthrough: random centroid initialisation.

    This is the partial class shown at this point of the article. ``train``
    already calls ``KMeans.centroids_find_closest``, which is not defined in
    this class body; the article introduces it in the following step.
    """

    def __init__(self, data, num_clustres):
        """Store the samples (one row per sample) and the number of clusters."""
        self.data = data
        self.num_clustres = num_clustres

    def train(self, max_iterations):
        """Pick initial centroids, then assign samples for each iteration.

        At this stage of the walkthrough nothing is returned.
        """
        # 1. Randomly choose k samples as the initial centroids.
        centroids = KMeans.centroids_init(self.data, self.num_clustres)
        # 2. Training loop: only the assignment step exists so far.
        for _ in range(max_iterations):
            closest_centroids_ids = KMeans.centroids_find_closest(self.data, centroids)

    @staticmethod
    def centroids_init(data, num_clustres):
        """Return ``num_clustres`` rows of ``data`` chosen at random.

        The method is static, so it takes no ``self``; it is called as
        ``KMeans.centroids_init(data, num_clustres)``.
        """
        # data.shape[0] is the number of rows (samples).
        num_examples = data.shape[0]
        # Shuffle the row indices and keep the first k of them.
        random_ids = np.random.permutation(num_examples)
        centroids = data[random_ids[:num_clustres], :]
        return centroids

然后对样本归属进行划分

def centroids_find_closest(self, data, centroids):
    """Assign every sample to its nearest centroid.

    ``self`` is not used; all inputs come from the arguments.

    Args:
        data: 2-D array, one sample per row.
        centroids: 2-D array, one centroid per row.

    Returns:
        Array of shape ``(num_examples, 1)`` holding, for each sample, the
        row index of the centroid with the smallest squared Euclidean
        distance.
    """
    num_examples = data.shape[0]  # number of samples
    num_centroids = centroids.shape[0]
    closest_centroids_ids = np.zeros((num_examples, 1))
    for example_index in range(num_examples):
        # One distance buffer per sample; it must be created outside the
        # centroid loop, otherwise earlier distances would be wiped out.
        distance = np.zeros((num_centroids, 1))
        for centroid_index in range(num_centroids):
            distance_diff = data[example_index, :] - centroids[centroid_index, :]
            distance[centroid_index] = np.sum(distance_diff ** 2)
        closest_centroids_ids[example_index] = np.argmin(distance)
    return closest_centroids_ids

最后更新数据中心,不断迭代

    def centroids_compute(self, data, closest_centroids_ids, num_clustres):
        """Recompute every centroid as the mean of the samples assigned to it.

        Args:
            data: 2-D array, one sample per row.
            closest_centroids_ids: per-sample centroid index, as produced by
                the assignment step.
            num_clustres: number of centroids to compute.

        Returns:
            Array of shape ``(num_clustres, num_features)``.
        """
        # Flatten once so the labels can be used directly as a row mask.
        labels = closest_centroids_ids.flatten()
        centroids = np.zeros((num_clustres, data.shape[1]))
        for centroid_id in range(num_clustres):
            members = data[labels == centroid_id, :]
            # Column-wise mean of the members is the new centroid position.
            centroids[centroid_id] = np.mean(members, axis=0)
        return centroids

整个Kmeans的聚类class定义如下

import numpy as np

class KMeans:
    """K-Means clustering based on squared Euclidean distance.

    Usage: ``centroids, ids = KMeans(data, k).train(max_iterations)``.
    """

    def __init__(self, data, num_clustres):
        """Store the samples (2-D array, one row per sample) and cluster count."""
        self.data = data
        self.num_clustres = num_clustres

    def train(self, max_iterations):
        """Run K-Means for at most ``max_iterations`` iterations.

        Returns:
            ``(centroids, closest_centroids_ids)`` where ``centroids`` has one
            row per cluster and ``closest_centroids_ids`` has shape
            ``(num_examples, 1)``.
        """
        # 1. Randomly choose k samples as the initial centroids.
        centroids = KMeans.centroids_init(self.data, self.num_clustres)
        # 2. Alternate between assignment and centroid update.
        closest_centroids_ids = None
        for _ in range(max_iterations):
            new_ids = KMeans.centroids_find_closest(self.data, centroids)
            # Unchanged assignments reproduce the same centroids, so any
            # further iteration would be a no-op: stop early.
            if closest_centroids_ids is not None and np.array_equal(new_ids, closest_centroids_ids):
                break
            closest_centroids_ids = new_ids
            # Passing the current centroids lets empty clusters keep their
            # position instead of turning into NaN.
            centroids = KMeans.centroids_compute(
                self.data, closest_centroids_ids, self.num_clustres, centroids
            )
        if closest_centroids_ids is None:
            # max_iterations <= 0: still return real assignments rather than
            # an uninitialised buffer.
            closest_centroids_ids = KMeans.centroids_find_closest(self.data, centroids)
        return centroids, closest_centroids_ids

    @staticmethod
    def centroids_init(data, num_clustres):
        """Return ``num_clustres`` rows of ``data`` chosen at random.

        Raises:
            ValueError: if ``num_clustres`` is smaller than 1 or larger than
                the number of samples.
        """
        num_examples = data.shape[0]
        if not 1 <= num_clustres <= num_examples:
            raise ValueError(
                "num_clustres must be between 1 and the number of samples "
                f"({num_examples}), got {num_clustres}"
            )
        random_ids = np.random.permutation(num_examples)
        return data[random_ids[:num_clustres], :]

    @staticmethod
    def centroids_find_closest(data, centroids):
        """Return, per sample, the index of the nearest centroid.

        The result has shape ``(num_examples, 1)`` and float dtype.
        """
        # Broadcast to (num_examples, num_centroids, num_features) so all
        # pairwise differences are computed in one vectorised step.
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        squared_distances = np.sum(diffs ** 2, axis=2)
        nearest = np.argmin(squared_distances, axis=1)
        return nearest.astype(float).reshape(-1, 1)

    @staticmethod
    def centroids_compute(data, closest_centroids_ids, num_clustres, previous_centroids=None):
        """Recompute each centroid as the mean of its assigned samples.

        Args:
            data: 2-D array, one sample per row.
            closest_centroids_ids: per-sample centroid index.
            num_clustres: number of centroids.
            previous_centroids: optional array of current centroids. A cluster
                with no assigned samples keeps its row from this array; when
                it is omitted such a cluster's centroid is NaN.

        Returns:
            Array of shape ``(num_clustres, num_features)``.
        """
        num_features = data.shape[1]
        centroids = np.zeros((num_clustres, num_features))
        labels = np.asarray(closest_centroids_ids).flatten()
        for centroid_id in range(num_clustres):
            members = data[labels == centroid_id, :]
            if members.shape[0] > 0:
                centroids[centroid_id] = np.mean(members, axis=0)
            elif previous_centroids is not None:
                # Empty cluster: stay where we were. A NaN centroid would be
                # picked by argmin for every sample on the next assignment.
                centroids[centroid_id] = previous_centroids[centroid_id]
            else:
                centroids[centroid_id] = np.nan
        return centroids
    

对鸢尾花数据集进行聚类

数据来源:http://download.tensorflow.org/data/iris_training.csv,可能需要对标签进行修改,如果不想自己修改,可以使用文末的链接

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from k_means import KMeans
# Load the iris data and choose the two features used for plotting/clustering.
data = pd.read_csv('iris.csv')
iris_types = ['setosa','versicolor','virginica']
x_axis = 'petal_length'
y_axis = 'petal_width'


def _plot_known_labels():
    """Draw the left-hand panel: samples coloured by their true class."""
    plt.subplot(1,2,1)
    for iris_type in iris_types:
        of_this_type = data['class']==iris_type
        plt.scatter(data[x_axis][of_this_type],data[y_axis][of_this_type],label = iris_type)
    plt.title('label known')
    plt.legend()


# Figure 1: labelled samples next to the same samples without labels.
plt.figure(figsize = (12,5))
_plot_known_labels()
plt.subplot(1,2,2)
plt.scatter(data[x_axis],data[y_axis])
plt.title('label unknown')
plt.show()

# Build the (num_examples, 2) training matrix from the two chosen columns.
num_examples = data.shape[0]
x_train = data[[x_axis,y_axis]].values.reshape(num_examples,2)

# Training parameters.
num_clusters = 3
max_iteritions = 50
k_means = KMeans(x_train,num_clusters)
centroids,closest_centroids_ids = k_means.train(max_iteritions)

# Figure 2: true labels next to the clusters found by K-Means.
plt.figure(figsize = (12,5))
_plot_known_labels()

plt.subplot(1,2,2)
for centroid_id in range(len(centroids)):
    # Boolean row mask of the samples assigned to this cluster.
    current_examples_index = (closest_centroids_ids == centroid_id).flatten()
    plt.scatter(data[x_axis][current_examples_index],data[y_axis][current_examples_index],label = centroid_id)

# Mark every centroid with a black cross.
for centroid in centroids:
    plt.scatter(centroid[0],centroid[1],c='black',marker = 'x')
plt.legend()
plt.title('k means')
plt.show()

链接:https://pan.baidu.com/s/1aod20TFcWNs3_mNKq0ZngQ
提取码:p3pb

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值