Classifying industry tariff models with K-means clustering

This post clusters the different industries.

In an earlier post I fitted a VAR model for each industry; here the VAR's forecast error variance decomposition (FEVD) is used as the feature set for the clustering.

Sharing the approach here.

Part of the code is adapted from the examples on the scikit-learn website.

Comments and discussion are welcome.
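
VARFitter and the data helpers (GetClasses, ReadTariffData, ProcessBaseData) come from my VARModelFitting module in the earlier post. For orientation, here is a minimal sketch of what the FEVD feature extraction boils down to, assuming the fitting is done with statsmodels; the function name below is only illustrative, not the real VARModelFitting API:

# Minimal sketch of the FEVD feature extraction, assuming statsmodels under the hood.
import pandas as pd
from statsmodels.tsa.api import VAR

def fevd_features(df: pd.DataFrame, lag: int = 3, max_period: int = 10):
    results = VAR(df).fit(lag)              # fit a VAR(lag) on one industry's series
    return results.fevd(max_period).decomp  # ndarray, shape (n_vars, max_period, n_vars)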

import matplotlib.pyplot as plt

from VARModelFitting import *
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics.pairwise import pairwise_distances_argmin
import scipy.interpolate as spi
import time


class MyIndustry(object):
    def __init__(self, Data: pd.DataFrame, name, lag=3):
        self.name = name
        self.rawDecompInfo = self.fevdIndentity(Data, lag, 10)
        self.DataJointing()
        # print(name,':\n',self.rawDecompInfo)
        pass

    @staticmethod
    def fevdIndentity(Data, lag, MaxPeriod):
        # VARFitter(...)[1] is the fitted VAR result; decomp has shape
        # (n_vars, MaxPeriod, n_vars) in the statsmodels convention
        return VARFitter(Data, lag)[1].fevd(MaxPeriod).decomp

    def DataJointing(self, sepecificFevd=-1, enhanceFlag=True, enhanceCount=1000):
        # sepecificFevd < 0: stack the FEVD matrices of all response variables row-wise;
        # sepecificFevd >= 0: keep only that variable's decomposition, optionally upsampled
        if sepecificFevd < 0:
            output = []
            for item in self.rawDecompInfo:
                if isinstance(output, np.ndarray):
                    output = np.concatenate((output, item), axis=0)
                else:
                    output = item
        else:
            output = self.rawDecompInfo[sepecificFevd]
            print("output:\n", output)
            if enhanceFlag:
                # pass enhanceCount by keyword, otherwise it would be taken as _min
                output = self.DataEnhance(output, enhanceCount=enhanceCount)
                print("enhanceFlag output:\n", output)
        return output

    @staticmethod
    def DataEnhance(data: np.ndarray, _min=-1, _max=-1, enhanceCount=50):
        # Upsample each column of the FEVD matrix with a cubic spline so that every
        # industry contributes the same, denser set of sample points.
        if _min < 0 or _max < 0:
            _min = 0
            _max = data.shape[0]
        output = []
        rawData = data.T
        periods = np.arange(_min, _max)
        # interpolate strictly inside the fitted range (the last knot is _max - 1)
        amplifiedPeriods = np.linspace(_min, _max - 1, enhanceCount)
        for row in rawData:
            tck = spi.splrep(periods, row)
            result = spi.splev(amplifiedPeriods, tck, der=0)
            result = result.reshape(result.shape[0], 1)
            if isinstance(output, np.ndarray):
                output = np.hstack((output, result))
            else:
                output = result
        return output

    def industryIdentityDefine(self, k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0):
        inputs = self.DataJointing(sepecificFevd=sepecificFevd, enhanceFlag=False)
        # pairwise_distances_argmin assigns each FEVD point to its nearest cluster centre,
        # which is a neat trick (see the standalone illustration after this class)
        k_means_labels = pairwise_distances_argmin(inputs, k_means_cluster_centers)
        mbk_means_labels = pairwise_distances_argmin(inputs, mbk_means_cluster_centers)
        k_means_result = np.zeros((n_clusters, 1))
        mbk_means_result = np.zeros((n_clusters, 1))
        for i in range(n_clusters):
            k_means_result[i] = sum(k_means_labels == i)
            mbk_means_result[i] = sum(mbk_means_labels == i)
        return k_means_result, mbk_means_result
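
The classification step above relies entirely on pairwise_distances_argmin(X, centers): for every row of X it returns the index of the nearest row of centers, i.e. a cluster label. A small standalone illustration with made-up numbers:

# pairwise_distances_argmin as a label assigner (toy data, not part of the script)
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin

points = np.array([[0.0, 0.1], [0.2, 0.0], [0.9, 1.0]])
centers = np.array([[0.0, 0.0], [1.0, 1.0]])
print(pairwise_distances_argmin(points, centers))  # [0 0 1]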


def industryIdentityShow(k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0):
    plt.figure("industryIdentityShow", figsize=(20, 16))
    Data_base, classNames = GetClasses()
    for name, idx in zip(classNames, range(0, len(classNames))):
        Data = ReadTariffData(Data_base, name)
        Data = ProcessBaseData(Data)
        name = name.strip()
        try:
            k_means_result, mbk_means_result = \
                MyIndustry(Data, name).industryIdentityDefine(k_means_cluster_centers, mbk_means_cluster_centers,
                                                              n_clusters, sepecificFevd)
            # the data layout here is a bit rough: a fixed 5x5 grid of subplots
            plt.subplot(5, 5, idx + 1), plt.title(name)
            plt.bar(np.arange(k_means_result.shape[0]), k_means_result.T[0])
            # plt.subplot(5, 10, 2 * idx + 2), plt.title(name)
            # plt.bar(np.arange(k_means_result.shape[0]), mbk_means_result.T[0]), plt.axis('off')
        except np.linalg.LinAlgError as e:
            print(name + ": fail the var model LinAlgError: ", e)
            print(repr(e))

    plt.show()
    pass


def clusteringPeriod(inputs, batch_size=45, n_clusters=5):
    # #############################################################################
    # Compute clustering with KMeans
    k_means = KMeans(init="k-means++", n_clusters=n_clusters, n_init=10)
    t0 = time.time()
    k_means.fit(inputs)
    t_batch = time.time() - t0

    print('cluster_centers of k_means:\n', k_means.cluster_centers_)

    # #############################################################################
    # Compute clustering with MiniBatchKMeans

    mbk = MiniBatchKMeans(
        init="k-means++",
        n_clusters=n_clusters,
        batch_size=batch_size,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
    )
    t0 = time.time()
    mbk.fit(inputs)
    t_mini_batch = time.time() - t0

    print('cluster_centers of MiniBatchKMeans:\n', mbk.cluster_centers_)

    return k_means, t_batch, mbk, t_mini_batch
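
clusteringPeriod simply runs the two estimators side by side and times them. To see the typical speed/inertia trade-off in isolation, a quick check on synthetic blobs (illustration only, not part of the pipeline) looks like this:

# KMeans vs MiniBatchKMeans on synthetic data (illustration only)
import time
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=3000, centers=5, random_state=0)
for Est in (KMeans, MiniBatchKMeans):
    t0 = time.time()
    est = Est(n_clusters=5, n_init=10, random_state=0).fit(X)
    print(Est.__name__, "inertia=%.1f" % est.inertia_, "time=%.3fs" % (time.time() - t0))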


def clusteringComparingPloting(inputs, batch_size=45, n_clusters=5):
    k_means, t_batch, mbk, t_mini_batch = clusteringPeriod(inputs, batch_size, n_clusters)
    n_clusters = k_means.n_clusters
    # #############################################################################
    # Plot result

    fig = plt.figure(figsize=(8, 3))
    fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
    colors = ["#4EACC5", "#FF9C34", "#4E9A06"]

    # We want to have the same colors for the same cluster from the
    # MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
    # closest one.
    k_means_cluster_centers = k_means.cluster_centers_
    order = pairwise_distances_argmin(k_means.cluster_centers_, mbk.cluster_centers_)
    mbk_means_cluster_centers = mbk.cluster_centers_[order]

    # pairwise_distances_argmin again: every sample gets the label of its nearest centre
    # (a toy example of the centre matching follows this function)
    k_means_labels = pairwise_distances_argmin(inputs, k_means_cluster_centers)
    mbk_means_labels = pairwise_distances_argmin(inputs, mbk_means_cluster_centers)

    # KMeans
    ax = fig.add_subplot(1, 3, 1)
    for k, col in zip(range(n_clusters), colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(inputs[my_members, 0], inputs[my_members, 1], "w", markerfacecolor=col, marker=".")
        ax.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=col,
            markeredgecolor="k",
            markersize=6,
        )
    ax.set_title("KMeans")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.text(0.02, 0.9, "train time: %.2fs\ninertia: %f" % (t_batch, k_means.inertia_), transform=ax.transAxes)

    # MiniBatchKMeans
    ax = fig.add_subplot(1, 3, 2)
    for k, col in zip(range(n_clusters), colors):
        my_members = mbk_means_labels == k
        cluster_center = mbk_means_cluster_centers[k]
        ax.plot(inputs[my_members, 0], inputs[my_members, 1], "w", markerfacecolor=col, marker=".")
        ax.plot(
            cluster_center[0],
            cluster_center[1],
            "o",
            markerfacecolor=col,
            markeredgecolor="k",
            markersize=6,
        )
    ax.set_title("MiniBatchKMeans")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.text(0.02, 0.9, "train time: %.2fs\ninertia: %f" % (t_mini_batch, mbk.inertia_), transform=ax.transAxes)

    # Initialise the "different" mask to all False; the sklearn example compares against a
    # label that never occurs (== 4 with 3 clusters), which breaks here with n_clusters = 5
    different = np.zeros(mbk_means_labels.shape, dtype=bool)
    ax = fig.add_subplot(1, 3, 3)

    for k in range(n_clusters):
        different += (k_means_labels == k) != (mbk_means_labels == k)

    identic = np.logical_not(different)
    ax.plot(inputs[identic, 0], inputs[identic, 1], "w", markerfacecolor="#bbbbbb", marker=".")
    ax.plot(inputs[different, 0], inputs[different, 1], "w", markerfacecolor="m", marker=".")
    ax.set_title("Difference")
    ax.set_xticks(())
    ax.set_yticks(())

    plt.show()

    return k_means_cluster_centers, mbk_means_cluster_centers, n_clusters
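
The same pairwise_distances_argmin call also handles the centre matching above: applied to the KMeans centres against the MiniBatchKMeans centres it returns, for each KMeans centre, the index of the closest MiniBatchKMeans centre, so indexing with that result reorders the second set to line up with the first. A toy example:

# Matching two sets of cluster centres by nearest neighbour (toy data)
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances_argmin

kmeans_centers = np.array([[0.0, 0.0], [1.0, 1.0]])
mbk_centers = np.array([[1.1, 0.9], [0.1, -0.1]])  # same clusters, different order
order = pairwise_distances_argmin(kmeans_centers, mbk_centers)
print(order)               # [1 0]
print(mbk_centers[order])  # rows reordered to match kmeans_centers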


# In short: return one matrix in which every row is a single sample for the clustering step
# (see the shape check right after this function)
def unpackData(unpackSepecificItem=-1, enhanceFlag=True):
    Data_base, classNames = GetClasses()
    output = []
    for name in classNames:
        Data = ReadTariffData(Data_base, name)
        Data = ProcessBaseData(Data)
        name = name.strip()
        try:
            if isinstance(output, np.ndarray):
                output = np.concatenate((output, MyIndustry(Data, name).DataJointing(unpackSepecificItem, enhanceFlag)),
                                        axis=0)
            else:
                output = MyIndustry(Data, name).DataJointing(unpackSepecificItem, enhanceFlag)
        except TypeError as e:
            print(name + ": fail the var model, TypeError")
            print(repr(e))
            # warnings.warn(name+": fail the var model")
            continue
        except np.linalg.LinAlgError as e:
            print(name + ": fail the var model LinAlgError: ", e)
            print(repr(e))
        except Exception as e:
            print(name + ": fail the var model, unknown: ", e)
            print(repr(e))
            # warnings.warn(name+": fail the var model")
            continue
    print(output.shape)
    return output
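
With unpackSepecificItem >= 0 and enhanceFlag=True, every industry that fits successfully contributes an (enhanceCount, n_vars) block, so the stacked matrix has roughly enhanceCount x n_industries rows, each row being one point in n_vars-dimensional FEVD space. A shape-only sanity check with made-up sizes:

# Shape-only sanity check for the stacked feature matrix (sizes are made up)
import numpy as np

n_industries, enhance_count, n_vars = 20, 1000, 4
blocks = [np.random.rand(enhance_count, n_vars) for _ in range(n_industries)]
features = np.concatenate(blocks, axis=0)
print(features.shape)  # (20000, 4)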


def main():
    data = unpackData(unpackSepecificItem=1, enhanceFlag=True)
    k_means_cluster_centers, mbk_means_cluster_centers, n_clusters = clusteringComparingPloting(data, n_clusters=5)
    industryIdentityShow(k_means_cluster_centers, mbk_means_cluster_centers, n_clusters, sepecificFevd=0)
    pass


if __name__ == '__main__':
    main()
