python-机器学习-K-Mean算法

最新推荐文章于 2023-08-29 22:21:38 发布

weixin_35830789

最新推荐文章于 2023-08-29 22:21:38 发布

阅读量422

点赞数

文章标签：机器学习 python 算法

本文链接：https://blog.csdn.net/weixin_35830789/article/details/122370292

版权

python-机器学习-K-Mean算法
本文是用python学习机器学习系列第四篇
都是自己重新用代码实现K-Mean算法，而不是调用算法包
代码如下：


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
import re
from collections import defaultdict
from sklearn.model_selection import train_test_split
import sklearn.datasets as ds
import matplotlib.colors

#K-Mean
class KMean(object):
    """K-Mean 算法实现"""

    # 初始化
    def __init__(self,traindata=np.array([]),n_clusters=2, kernel='LINE',mode='RANDOM'):
        self.kernel = kernel # 计算距离方式
        self.traindata = traindata #训练数据
        self.n_clusters = n_clusters #簇数
        self.mode = mode
        self.centers = np.array([])
        self.y_hat = []

    # 拟合函数
    def fit_predict(self,data):
        self.traindata = data
        self.centers = self.cal_firstChoose()
        y_hat = self.cal_classify(self.centers)
        while self.y_hat != y_hat:
            self.y_hat = y_hat
            self.cal_centers()
            y_hat = self.cal_classify(self.centers)
        return self.y_hat

    # 预测多个样本
    def predict(self, x):
        y_hat = np.array([])
        for i in range(len(x)):
            y_hatone = self.predict_one(x[i])
            y_hat = np.append(y_hat,y_hatone)
        return y_hat

    #预测一个样本
    def predict_one(self,x):
        y_hat = 0.0
        for i in range(len(self.a)):
            y_hat += self.a[i] * self.yl[i] * self.cal_kenrel(x,self.xl[i])
        y_hat+=self.b
        if y_hat >= 0:
            y_hat = 1
        else:
            y_hat = -1
        return y_hat

    #计算两点距离
    def cal_distanc(self,x,y):
        distanc = 100000
        # 笛卡尔距离
        if self.kernel.upper() == 'LINE':
            distanc = np.sqrt(np.sum(np.square(x - y)))
        elif self.kernel.upper() == 'LINE':
            distanc = (np.inner(x , y)) ** 0.5
        return distanc
    
    #初始簇中心选择
    def cal_firstChoose(self):
        if self.mode == 'RANDOM':
            randomInt = np.random.randint(0,len(self.traindata),self.n_clusters)
            centers = self.traindata[randomInt]
            return centers
        elif self.mode == 'K++':
            return
        return

    #计算簇中心-one
    def cal_center(self,x=[]):
        if len(x) == 0:
            return (-100000,-100000)
        sum = np.sum(x,axis=0)
        center = sum / len(x)
        return center

    #计算簇中心-all
    def cal_centers(self):
        centers = np.array([])
        xs = []
        for j in range(self.n_clusters):
            xs.append([])
        for i in range(len(self.traindata)):
            xs[self.y_hat[i]].append(self.traindata[i])
        for k in range(self.n_clusters):
                l = self.cal_center(xs[k])
                l = np.array(l)
                centers = np.append(centers,l,0)
        self.centers = centers.reshape(self.n_clusters,2)
        return self

    #计算距离并分类
    def cal_classify(self,x):
        y_hat = []
        # 遍历计算距离
        for j in range(len(self.traindata)):
            y_dist = np.array([])
            for i in range(len(x)):
                y_dist = np.append(y_dist,self.cal_distanc(x[i],self.traindata[j]))
            g = np.argwhere(y_dist == y_dist.min())
            y_hat.append(int(g[0]))
        return y_hat

def expand(a, b):
    d = (b - a) * 0.1
    return a - d, b + d


if __name__ == "__main__":
    N = 400
    centers = 4
    data, y = ds.make_blobs(N, n_features=2, centers=centers, random_state=2)
    data2, y2 = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=(1,2.5,0.5,2), random_state=2)
    data3 = np.vstack((data[y == 0][:], data[y == 1][:50], data[y == 2][:20], data[y == 3][:5]))
    y3 = np.array([0] * 100 + [1] * 50 + [2] * 20 + [3] * 5)

    cls = KMean(n_clusters=3)
    y_hat = cls.fit_predict(data)
    y2_hat = cls.fit_predict(data2)
    y3_hat = cls.fit_predict(data3)

    m = np.array(((1, 1), (1, 3)))
    data_r = data.dot(m)
    y_r_hat = cls.fit_predict(data_r)

    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    cm = matplotlib.colors.ListedColormap(list('rgbm'))

    plt.figure(figsize=(9, 10), facecolor='w')
    plt.subplot(421)
    plt.title(u'原始数据')
    plt.scatter(data[:, 0], data[:, 1], c=y, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(422)
    plt.title(u'KMeans++聚类')
    plt.scatter(data[:, 0], data[:, 1], c=y_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(423)
    plt.title(u'旋转后数据')
    plt.scatter(data_r[:, 0], data_r[:, 1], c=y, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data_r, axis=0)
    x1_max, x2_max = np.max(data_r, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(424)
    plt.title(u'旋转后KMeans++聚类')
    plt.scatter(data_r[:, 0], data_r[:, 1], c=y_r_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(425)
    plt.title(u'方差不相等数据')
    plt.scatter(data2[:, 0], data2[:, 1], c=y2, s=30, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data2, axis=0)
    x1_max, x2_max = np.max(data2, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(426)
    plt.title(u'方差不相等KMeans++聚类')
    plt.scatter(data2[:, 0], data2[:, 1], c=y2_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(427)
    plt.title(u'数量不相等数据')
    plt.scatter(data3[:, 0], data3[:, 1], s=30, c=y3, cmap=cm, edgecolors='none')
    x1_min, x2_min = np.min(data3, axis=0)
    x1_max, x2_max = np.max(data3, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    plt.subplot(428)
    plt.title(u'数量不相等KMeans++聚类')
    plt.scatter(data3[:, 0], data3[:, 1], c=y3_hat, s=30, cmap=cm, edgecolors='none')
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.grid(True)

    #plt.tight_layout(2)
    #plt.suptitle(u'数据分布对KMeans聚类的影响', fontsize=18)
    # https://github.com/matplotlib/matplotlib/issues/829
    plt.subplots_adjust(top=0.92)
    plt.show()