先附上代码,学习笔记回头再补充。
完整代码如下:
# -*- coding: utf-8 -*-
# 关闭警告
# import warnings
# warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,cross_validate
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.cluster import KMeans
from sklearn import preprocessing as prep
import matplotlib.pyplot as plt
class Cluster:
    """Data-preparation and K-Means clustering pipeline for price-segment data.

    Intended call order (see the ``__main__`` block): data_read ->
    outlier_filtrate -> data_scale -> data_factor -> kmeans_cluster, with
    corr_filtrate / vif_filtrate available as optional feature filters.
    Throughout the class the LAST column of a frame is treated as the
    dependent variable.
    """

    def data_read(self, data_path, file_name, typeid):
        """Load the Excel source and split off the model features.

        :param data_path: directory of the file, including a trailing separator
        :param file_name: Excel file name
        :param typeid: iterable of price-segment ids to keep
        :return: (raw segment rows, segment rows without auxiliary columns)
        """
        # Rows with any missing value are discarded immediately after loading.
        data = pd.read_excel(data_path + file_name, index_col='pack_bar').dropna()
        data1_type = data[data['typeid'].isin(typeid)]  # keep the requested segments
        # data1_type = data1_type[data1_type['ccom_id'].isin([11110001])]  # optionally: one city only
        # Drop the columns that are not model features.
        # (pandas 2.0 removed the positional axis argument of drop().)
        data_type = data1_type.drop(columns=['typeid', 'month_double', 'ccom_id', 'net_month_double'])
        return data1_type, data_type

    def _bound_replace(self, column, up, dw, fill, clamp=False):
        """Return a copy of ``column`` with out-of-range values replaced.

        :param column: numeric Series to clean
        :param up: upper bound; values strictly above it are outliers
        :param dw: lower bound; values strictly below it are outliers
        :param fill: 'nan' writes NaN; anything else writes a replacement value
        :param clamp: True  -> replace with the bound itself (threshold mode);
                      False -> replace with the most extreme in-range value
        :return: cleaned copy of ``column``
        """
        # Masks and replacement values are computed on a snapshot of the
        # original values, then written through a real (non-chained) setitem —
        # the original chained `iloc[...][mask] = ...` wrote into a temporary.
        snap = column.copy()
        out = column.copy()
        over = snap > up
        under = snap < dw
        if over.any():
            print('存在上限异常值')
            out[over] = np.nan if fill == 'nan' else (up if clamp else snap[snap < up].max())
        else:
            print('不存在上限异常值')
        if under.any():
            print('存在下限异常值')
            # BUG FIX: the original filled low outliers with
            # `datai[datai < dw].min()` — the minimum of the outliers
            # themselves, a no-op. Use the smallest in-range value instead,
            # mirroring the upper branch.
            out[under] = np.nan if fill == 'nan' else (dw if clamp else snap[snap > dw].min())
        else:
            print('不存在下限异常值')
        return out

    def outlier_filtrate(self, data_type, method='std', fill='nan', threshold=1):
        """Outlier handling.

        :param data_type: raw feature frame
        :param method: 'std' (mean +/- 2*std), 'quantile' (1.5*IQR box rule) or
            'threshold' (fixed bound, applied to '*_incr' growth columns only)
        :param fill: 'nan' -> mark outliers as NaN (those rows are dropped at
            the end); anything else -> replace with an in-range extremum
            ('std'/'quantile') or with the bound itself ('threshold')
        :param threshold: fixed bound, used only when method == 'threshold'
        :return: cleaned frame (rows containing NaN are removed)
        """
        colnames = data_type.columns
        # Only growth-rate columns ('*_incr') are bounded in 'threshold' mode.
        colnames2 = [name for name in colnames if name.find('_incr') > 0]
        data2_type = data_type.copy()
        for i in range(data_type.shape[1]):
            datai = data2_type.iloc[:, i]
            if method == 'std':
                xmean = datai.mean()
                xstd = datai.std()
                data2_type[colnames[i]] = self._bound_replace(
                    datai, xmean + 2 * xstd, xmean - 2 * xstd, fill)
            elif method == 'quantile':
                q1 = datai.quantile(0.25)
                q3 = datai.quantile(0.75)
                iqr = q3 - q1
                data2_type[colnames[i]] = self._bound_replace(
                    datai, q3 + 1.5 * iqr, q1 - 1.5 * iqr, fill)
            elif method == 'threshold' and colnames[i] in colnames2:
                data2_type[colnames[i]] = self._bound_replace(
                    datai, threshold, (-1.0) * threshold, fill, clamp=True)
        data2_type = data2_type.dropna()  # drop rows whose outliers became NaN
        return data2_type

    def corr_filtrate(self, data_type, thred_corr=0.4):
        """Drop one variable out of every highly correlated pair.

        For each pair with |corr| >= thred_corr the variable whose mean
        correlation against all others is larger is dropped.  The last column
        (the dependent variable) is never considered for removal, matching
        vif_filtrate.

        :param data_type: feature frame
        :param thred_corr: absolute-correlation threshold
        :return: frame restricted to the surviving columns
        """
        corrX = data_type.corr()
        colnames = data_type.columns
        to_drop = []
        for j in range(corrX.shape[1] - 1):
            for i in range(j + 1, corrX.shape[0] - 1):
                if abs(corrX.iloc[i, j]) >= thred_corr:
                    # Keep the variable with the smaller mean correlation.
                    # NOTE(review): the original comment said "mean absolute
                    # correlation" but the code uses the signed mean — confirm
                    # which was intended.
                    if np.mean(corrX.iloc[i, :]) < np.mean(corrX.iloc[:, j]):
                        to_drop.append(colnames[j])
                    else:
                        to_drop.append(colnames[i])
                    break
        kept = colnames.drop(list(set(to_drop)))
        return data_type[kept]

    def vif_filtrate(self, data2_type, thred_vif=4):
        """Iteratively drop features whose variance inflation factor is too high.

        The last column (the dependent variable) is never dropped.

        :param data2_type: feature frame
        :param thred_vif: VIF threshold
        :return: frame with collinear features removed
        """
        def all_vifs(frame):
            # Collinearity check over every column.
            return [round(variance_inflation_factor(frame.values, i), 2)
                    for i in range(frame.shape[1])]

        data3_type = data2_type.copy()
        vif = all_vifs(data3_type)
        # BUG FIX: loop condition now ignores the last column's VIF. The
        # original tested the full list while the body could only remove
        # non-last columns, so a high VIF on the last column alone spun forever.
        while any(v >= thred_vif for v in vif[:-1]):
            colnames = data3_type.columns[:-1]
            for i in range(len(vif) - 1):
                if vif[i] >= thred_vif:
                    data3_type = data3_type.drop(columns=colnames[i])
                    vif = all_vifs(data3_type)
                    break
        return data3_type

    def data_scale(self, data3_type, method='normalize'):
        """Feature scaling.

        :param data3_type: feature frame
        :param method: one of
            'minmax'    - scale each column to [0, 1]
            'z-score'   - standardize each column to zero mean / unit variance
            'normalize' - scale each ROW to unit L2 norm
            'maxabs'    - divide by the column's max absolute value (keeps sign)
            'robust'    - center/scale by median and IQR
        :return: scaled frame with the original columns and index
        :raises ValueError: for an unknown ``method`` (the original fell
            through and raised an accidental NameError)
        """
        if method == 'minmax':
            scaled = prep.minmax_scale(data3_type, feature_range=(0, 1), axis=0, copy=True)
        elif method == 'z-score':
            scaled = prep.scale(data3_type, axis=0, with_mean=True, with_std=True, copy=True)
        elif method == 'normalize':
            scaled = prep.normalize(data3_type, norm='l2', axis=1)
        elif method == 'maxabs':
            # The only scaler usable on scipy.sparse input; output is in [-1, 1].
            scaled = prep.maxabs_scale(data3_type, axis=0, copy=True)
        elif method == 'robust':
            scaled = prep.robust_scale(data3_type, axis=0, with_centering=True, with_scaling=True, copy=True)
        else:
            raise ValueError('unknown scaling method: %r' % method)
        data4_type = pd.DataFrame(data=scaled, columns=data3_type.columns, index=data3_type.index)
        return data4_type

    def data_factor(self, data4_type, replace='dependent', threshold=0.05, colnames=None):
        """Binarize columns: values > threshold become 1, the rest 0.

        :param data4_type: input frame
        :param replace: 'all' -> every column; 'dependent' -> last column only;
            'colnames' -> the columns listed in ``colnames``
        :param threshold: binarization cut-off
        :param colnames: list of column names, only used when replace == 'colnames'
        :return: frame with the selected columns binarized
        """
        data5_type = data4_type.copy()
        if replace == 'all':
            binary = prep.binarize(data4_type, threshold=threshold, copy=True)  # <= threshold -> 0
            data5_type = pd.DataFrame(data=binary, columns=data5_type.columns, index=data5_type.index)
        elif replace == 'dependent':
            # Binarize only the dependent variable (last column).
            data5_type.iloc[:, -1] = (data5_type.iloc[:, -1] > threshold).astype(int)
        elif replace == 'colnames':
            # BUG FIX: the original single-column branch wrote into
            # ``data5_type[colnames].values`` — a temporary copy, so nothing
            # was updated. Assign through the frame instead.
            data5_type[colnames] = (data5_type[colnames] > threshold).astype(float)
        # Show the distribution of the (possibly binarized) dependent variable.
        print(data5_type.iloc[:, -1].value_counts())
        # encoder = prep.OneHotEncoder()
        # X_OH = encoder.fit_transform(data3_type)
        # df = pd.DataFrame(X_OH.toarray())
        return data5_type

    def kmeans_cluster(self, data5_type):
        """Show the elbow plot, fit a 2-cluster KMeans and return its centers.

        :param data5_type: prepared frame; the last column (dependent
            variable) is excluded from clustering
        :return: cluster-center array of the final 2-cluster model
        """
        data_train, data_test = train_test_split(data5_type, test_size=0.2)  # random_state=1234
        col_names = data_train.columns
        X = data_train[col_names[:-1]].copy()  # copy so adding labels below is safe
        # Elbow method: mean distance of every sample to its closest center.
        K = range(1, 10)
        meandistortion = []
        for k in K:
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            meandistortion.append(
                sum(np.min(cdist(X, kmeans.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
        plt.subplot(2, 1, 2)
        plt.plot(K, meandistortion, 'bx-')
        plt.xlabel('k')
        plt.ylabel(u'centers')
        plt.title(u'Choose Best k')
        plt.show()
        # Final model with the manually chosen k = 2.
        kmeans = KMeans(n_clusters=2)
        kmeans.fit(X)
        X['cluster'] = kmeans.labels_
        # Print the cluster sizes (the original computed them and threw the
        # result away), consistent with data_factor's distribution print.
        print(X['cluster'].value_counts())
        centers = kmeans.cluster_centers_
        return centers

    def data_predict(self, data, colnames, estm):
        """Predict the priority cities with a fitted classifier ``estm``.

        :param data: frame containing 'year1', 'year', 'type_id' plus features
        :param colnames: feature columns fed to the model
        :param estm: fitted estimator exposing ``predict``
        :return: index of cities predicted to grow
        """
        data_new = data[data['year1'] == 2016]
        data_new2 = data_new.drop(columns=['year1', 'year', 'type_id'])  # drop non-features
        X = data_new2[colnames]
        # data_g = self.data_group_byType(data_new2)
        predictY = estm.predict(X)
        result = pd.Series(index=X.index.tolist(), data=predictY.tolist())  # per-city growth prediction
        # NOTE(review): compares against the STRING '1' — verify that
        # estm.predict returns string labels; integer labels would never match.
        incr_ccom = result[result == '1'].index
        return incr_ccom
if __name__ == '__main__':
    # Location of the raw data file.
    data_path = 'C:\\Users\\90539\\PycharmProjects\\data\\'
    file_name = 'data.xlsx'
    typeid = ['B']  # price segment to analyse
    obj2 = Cluster()
    data, data1 = obj2.data_read(data_path, file_name, typeid)
    # data11 = obj2.data_group_byType(data1)
    # Pipeline: bound the growth-rate columns, scale to [0, 1], binarize the
    # dependent variable, then cluster.
    data2 = obj2.outlier_filtrate(data1, method='threshold', fill='nan', threshold=1)
    data3 = obj2.data_scale(data2, method='minmax')
    data4 = obj2.data_factor(data3, replace='dependent', threshold=0.1)
    centers = obj2.kmeans_cluster(data4)