基于K-means的动态主题模型话题分类

 


from numpy import array, zeros, argmin, inf, ndim
import scipy.stats
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import euclidean_distances


def dtw(x, y, warp=1):
    """Return the normalised dynamic-time-warping distance between x and y.

    The local cost between ``x[i]`` and ``y[j]`` is their Euclidean distance.
    The accumulated cost of the cheapest alignment path is divided by
    ``len(x) + len(y)``.

    Args:
        x: Non-empty sequence of points, array-like of shape ``(r, d)``.
            A 1-D sequence of scalars is treated as ``d == 1``.
        y: Non-empty sequence of points with the same ``d`` as ``x``.
        warp: How many rows/columns a single step may reach across.

    Returns:
        The accumulated alignment cost divided by ``len(x) + len(y)``.
    """
    assert len(x)
    assert len(y)
    xs = np.asarray(x, dtype=float).reshape(len(x), -1)
    ys = np.asarray(y, dtype=float).reshape(len(y), -1)
    r, c = len(xs), len(ys)

    # D0 carries one extra leading row/column of +inf so that the recurrence
    # needs no special-casing at the borders; D1 is a view on its interior.
    D0 = np.zeros((r + 1, c + 1))
    D0[0, 1:] = np.inf
    D0[1:, 0] = np.inf
    D1 = D0[1:, 1:]  # view

    # Pairwise Euclidean distances, computed in one vectorised expression.
    diff = xs[:, np.newaxis, :] - ys[np.newaxis, :, :]
    D1[:, :] = np.sqrt((diff ** 2).sum(axis=-1))

    for i in range(r):
        for j in range(c):
            min_list = [D0[i, j]]
            for k in range(1, warp + 1):
                # Clamp to r / c (D0's last valid index), not r - 1 / c - 1:
                # the tighter clamp drops the horizontal/vertical predecessor
                # on the last row and column.
                i_k = min(i + k, r)
                j_k = min(j + k, c)
                min_list += [D0[i_k, j], D0[i, j_k]]
            D1[i, j] += min(min_list)

    return D1[-1, -1] / sum(D1.shape)


# Load the data
def loadDataSet(fileName):
    """Read a comma-separated numeric file, skipping its first (header) row.

    Args:
        fileName: Path or file-like object accepted by ``np.loadtxt``.

    Returns:
        The parsed values as a NumPy array.
    """
    return np.loadtxt(fileName, skiprows=1, delimiter=",")


def saveDataSet(fileNmae, dataSet):
    """Write ``dataSet`` to ``fileNmae`` as comma-separated text.

    Args:
        fileNmae: Destination path or writable file-like object.
        dataSet: Array-like data handed to ``np.savetxt``.
    """
    np.savetxt(fname=fileNmae, X=dataSet, delimiter=',')


# Build a set of k random centroids for the given data set
def randCent(dataSet, k):
    """Pick ``k`` samples of ``dataSet`` at random to serve as centroids.

    Samples are drawn without replacement whenever ``k`` does not exceed the
    number of samples, so no two centroids start out as the same sample
    (identical centroids leave one of the clusters empty).

    Args:
        dataSet: Array of shape ``(m, n, p)``.
        k: Number of centroids to create.

    Returns:
        A new float array of shape ``(k, n, p)`` holding copies of the
        selected samples.
    """
    m, n, p = dataSet.shape
    # Repeats are only allowed when there are fewer samples than centroids.
    picks = np.random.choice(m, size=k, replace=k > m)
    centroids = np.zeros((k, n, p))
    centroids[:, :, :] = dataSet[picks, :, :]
    return centroids


# K-means clustering
def KMeans(dataSet, k):
    """Cluster the samples of ``dataSet`` into ``k`` groups using ``dtw``.

    Starting from the centroids returned by ``randCent``, every sample is
    assigned to the centroid with the smallest ``dtw`` distance, then each
    centroid is replaced by the mean of its samples. This repeats until no
    sample changes cluster.

    Args:
        dataSet: Array of shape ``(m, n, p)``, indexed per sample on axis 0.
        k: Number of clusters.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` has shape
        ``(k, n, p)``. ``clusterAssment`` is an ``(m, 2)`` ``np.matrix``:
        column 0 is the cluster index of each sample and column 1 the squared
        distance to that cluster's centroid as of the last assignment pass.
    """
    m = np.shape(dataSet)[0]  # number of samples
    # np.asmatrix rather than np.mat: the np.mat alias was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from an impossible label so the first pass always counts as a change.
    clusterAssment[:, 0] = -1
    clusterChange = True

    # Step 1: initialise the centroids
    centroids = randCent(dataSet, k)
    while clusterChange:
        clusterChange = False

        # Visit every sample
        for i in range(m):
            minDist = np.inf
            minIndex = -1

            # Step 2: find the nearest centroid
            for j in range(k):
                distance = dtw(centroids[j, :, :], dataSet[i, :, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            # Step 3: record the cluster of this sample. The error column is
            # refreshed on every pass because the centroids move even when the
            # label stays the same.
            if clusterAssment[i, 0] != minIndex:
                clusterChange = True
            clusterAssment[i, :] = minIndex, minDist ** 2

        # Step 4: move every centroid to the mean of its samples
        labels = np.asarray(clusterAssment[:, 0]).ravel()
        for j in range(k):
            pointsInCluster = dataSet[labels == j]
            # An empty cluster keeps its previous centroid; the mean of an
            # empty selection would be NaN.
            if len(pointsInCluster) > 0:
                centroids[j, :, :] = np.mean(pointsInCluster, axis=0)

    print("Congratulations,cluster complete!")
    return centroids, clusterAssment


def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot the first point of every sample, coloured by its cluster.

    For each sample the pair ``(dataSet[i, 0, 0], dataSet[i, 0, 1])`` is drawn
    as a circle whose colour is selected by the cluster index stored in
    ``clusterAssment[i, 0]``. The first point of each centroid is drawn as a
    diamond in the matching colour. The figure is then shown.

    Args:
        dataSet: Array of shape ``(m, n, p)`` with ``p >= 2``.
        k: Number of clusters (rows of ``centroids`` to draw).
        centroids: Array of shape ``(k, n, p)``.
        clusterAssment: Indexable as ``[i, 0]`` giving the cluster of sample i.
    """
    m, n, p = dataSet.shape
    # Colours are reused cyclically when there are more clusters than entries.
    colors = ['r', 'b', 'g', 'k', 'c', 'm', 'y']

    # Draw all samples. An explicit marker is required: plotting a single
    # point with the default line style renders nothing.
    for i in range(m):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0, 0], dataSet[i, 0, 1],
                 marker='o', linestyle='None',
                 color=colors[markIndex % len(colors)])

    # Draw the centroids
    for j in range(k):
        plt.plot(centroids[j, 0, 0], centroids[j, 0, 1],
                 marker='D', linestyle='None', markersize=10,
                 color=colors[j % len(colors)])

    plt.show()


if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    start = time.perf_counter()

    dataSet = loadDataSet("D:/151617-dtm-new/mds.csv")

    # Every row stores interleaved value pairs (columns 2*j and 2*j + 1).
    # Regroup them into an (m, n // 2, 2) array; floor division keeps the
    # sizes integral, and a trailing unpaired column is ignored.
    m, n = dataSet.shape
    pairs = n // 2
    dataSet_1 = dataSet[:, :2 * pairs].reshape(m, pairs, 2)

    k = 4
    centroids, clusterAssment = KMeans(dataSet_1, k)
    saveDataSet('D:/151617-dtm-new/cluster.csv', clusterAssment)

    showCluster(dataSet_1, k, centroids, clusterAssment)

    elapsed = time.perf_counter() - start
    print("Time used:", elapsed)

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值