基于 sklearn 的中文文本聚类

实习期间,老大给了一个任务:对运维历史数据进行分类。需要先做聚类,再把每一类总结出来。目前只实现了文本聚类这一步。

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 15:53:56 2018

@author: zs

"""
import re
import time
import jieba
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import Birch
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import matplotlib.cm as cm


# 导入数据集函数,返回聚类的数据与对应ID
def loadDataSet(filename):
    """Read a UTF-8 CSV file and return its last and first columns.

    Args:
        filename: Path of the CSV file passed to ``pandas.read_csv``.

    Returns:
        A ``(data, dataID)`` tuple of NumPy arrays, each reshaped to
        ``(rows, 1)``: ``data`` holds the last column of the file and
        ``dataID`` holds the first column.
    """
    frame = pd.read_csv(filename, encoding='utf-8')
    rowCount = frame.shape[0]
    values = frame.values
    # Column vectors: the last column is the text to cluster, the first
    # column is returned alongside it as the record ID.
    textColumn = values[:, -1].reshape((rowCount, 1))
    idColumn = values[:, 0].reshape((rowCount, 1))
    return textColumn, idColumn

# numpy 转化为 list
def ndarrayToList(dataArr):
    """Flatten a 2-D array into a plain list in row-major order.

    Args:
        dataArr: A two-dimensional NumPy array.

    Returns:
        A list containing every element of ``dataArr``, row by row.
    """
    rowCount, colCount = dataArr.shape
    return [
        dataArr[row, col]
        for row in range(rowCount)
        for col in range(colCount)
    ]

# 去掉字符串、特殊符号
def removeStr(listData):
    """Join the input into one string and strip punctuation-like characters.

    Args:
        listData: A string, or an iterable of strings, to be concatenated
            with no separator before cleaning.

    Returns:
        The concatenated text with every match of the pattern removed.
    """
    strData = "".join(listData)
    # The pattern is a raw string so that ``\s``, ``\!`` and ``\,`` reach the
    # regex engine without Python emitting invalid-escape warnings; the set of
    # matched text is the same as with the previous non-raw literal.
    #
    # First alternative: a run of whitespace/ASCII symbols that is directly
    # followed by a colon. Second alternative: a run of CJK/ASCII punctuation.
    #
    # NOTE(review): inside the second character class ``/-~`` is a range
    # (U+002F..U+007E), so ASCII digits and Latin letters are removed as
    # well -- confirm that this is intended before relying on it.
    pattern = r"[\s+\!\,$^*()+\"\']+:|[+——!,,《》“”〔【】;:。?、�./-~@#¥……&*()]+"
    return re.sub(pattern, "", strData)

# 创建停用词列表
def stopwordslist(filePath):
    """Load a stop-word list from a UTF-8 text file, one entry per line.

    Args:
        filePath: Path of the stop-word file.

    Returns:
        A list with one whitespace-stripped string per line of the file
        (blank lines yield empty strings).
    """
    # The context manager closes the file handle, which the previous
    # bare open(...).readlines() call left to the garbage collector.
    with open(filePath, 'r', encoding='utf-8') as stopwordFile:
        return [line.strip() for line in stopwordFile]

# 保存文件
def saveFile(filename, lines=None):
    """Append tokenised rows to a text file, one space-joined row per line.

    Args:
        filename: Path of the output file; it is opened in append mode, so
            existing content is kept and new rows are added after it.
        lines: Iterable of token lists to write. When omitted, the
            module-level ``dataSplit`` variable is used, which keeps the
            original one-argument call working.
    """
    if lines is None:
        # Backward-compatible fallback to the global assigned by the script
        # entry point.
        lines = dataSplit
    # UTF-8 is set explicitly so the output does not depend on the platform
    # default encoding and matches the other file I/O in this module.
    with open(filename, 'a', encoding='utf-8') as fr:
        fr.writelines(' '.join(line) + '\n' for line in lines)

# 对数据集分词、去停用词
def wordSplit(data, stopwordPath='./data/stopwords.txt'):
    """Clean, segment and stop-word-filter every text in ``data``.

    Args:
        data: 2-D array of texts; it is flattened with ``ndarrayToList``.
        stopwordPath: Path of the stop-word file read by ``stopwordslist``.
            Defaults to the location the script has always used.

    Returns:
        A list with one entry per text, each entry being the list of tokens
        produced by ``jieba.cut`` that survived filtering.
    """
    # A set gives O(1) membership tests; tab and space tokens are dropped
    # together with the stop words, as before.
    ignored = set(stopwordslist(stopwordPath))
    ignored.update(('\t', ' '))

    wordList = []
    for text in ndarrayToList(data):
        cleaned = removeStr(text)  # strip punctuation-like characters
        tokens = [token for token in jieba.cut(cleaned) if token not in ignored]
        wordList.append(tokens)
    return wordList

# 计算 tf-idf 值
def TFIDF(wordList):
    """Build the TF-IDF weight matrix for a tokenised corpus.

    Args:
        wordList: Sequence of token lists, one per document. The argument is
            no longer modified in place.

    Returns:
        A ``(word, weight)`` tuple: ``word`` is the list of vocabulary terms
        found by ``CountVectorizer`` and ``weight`` is the dense array
        obtained from the TF-IDF matrix, where ``weight[i][j]`` is the weight
        of term ``j`` in document ``i``.
    """
    # One space-separated string per document, built without overwriting the
    # caller's token lists.
    corpus = [" ".join(tokens) for tokens in wordList]

    # CountVectorizer turns the corpus into a term-count matrix and
    # TfidfTransformer re-weights those counts.
    # NOTE(review): CountVectorizer is used with its default settings; check
    # whether the default tokenisation keeps single-character Chinese words.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases. The result stays a list.
    if hasattr(vectorizer, "get_feature_names_out"):
        word = vectorizer.get_feature_names_out().tolist()
    else:
        word = vectorizer.get_feature_names()

    weight = tfidf.toarray()

    return word, weight

# 对生成的 tfidf 矩阵做PCA降维
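'''
The TF-IDF weight matrix is very sparse; PCA is used here to reduce its dimensionality.
NOTE(review): the original note justified PCA over SVD by saying SVD suits dense
matrices. scikit-learn documents TruncatedSVD as the variant that works directly on
sparse input, so this rationale should be double-checked.
'''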
def matrixPCA(weight, dimension):
    """Project the weight matrix onto ``dimension`` principal components.

    Args:
        weight: Array with a ``shape`` attribute, passed to
            ``PCA.fit_transform``.
        dimension: Value forwarded to ``PCA`` as ``n_components``.

    Returns:
        The transformed matrix returned by ``PCA.fit_transform``. The shapes
        before and after the reduction are printed.
    """
    reducer = PCA(n_components=dimension)
    reduced = reducer.fit_transform(weight)
    print("降维之前的权重维度:", weight.shape)
    print("降维之后的权重维度:", reduced.shape)
    return reduced

# 层级聚类 birch  k-means适合维度低且速度慢
def birch(matrix, k):
    """Cluster the rows of ``matrix`` with scikit-learn's ``Birch``.

    Args:
        matrix: Feature matrix passed to ``Birch.fit_predict``.
        k: Number of clusters, forwarded as ``n_clusters``.

    Returns:
        The cluster labels returned by ``fit_predict``.
    """
    model = Birch(n_clusters=k)
    return model.fit_predict(matrix)

# 计算轮廓系数
def Silhouette(matrix, y):
    """Compute and print the silhouette score of a clustering.

    Args:
        matrix: Feature matrix that was clustered.
        y: Cluster label of each row of ``matrix``.

    Returns:
        A ``(silhouette_avg, sample_silhouette_values)`` tuple: the result of
        ``silhouette_score`` and the per-sample result of
        ``silhouette_samples``.
    """
    perSample = silhouette_samples(matrix, y)
    meanScore = silhouette_score(matrix, y)
    print(meanScore)
    return meanScore, perSample

# 画图
def Draw(silhouette_avg, sample_silhouette_values, y, k):
    """Plot a silhouette diagram with one horizontal band per cluster.

    Args:
        silhouette_avg: Mean silhouette score, drawn as a dashed red
            vertical line.
        sample_silhouette_values: Array of per-sample silhouette values.
        y: Array of cluster labels aligned with
            ``sample_silhouette_values``; it is compared element-wise
            against each cluster index.
        k: Number of clusters; labels ``0 .. k-1`` are drawn.
    """
    fig, ax1 = plt.subplots(1)
    fig.set_size_inches(18, 7)
    # Silhouette values lie in [-1, 1]; only this window is displayed.
    ax1.set_xlim([-0.2, 0.5])
    # The y-limits are left to Matplotlib; bands start at 10 and are
    # separated by a gap of 10 so that neighbouring clusters do not touch.
    y_lower = 10

    for i in range(k):
        # Boolean-mask indexing returns a copy, so sorting it in place does
        # not disturb the caller's array.
        ith_cluster_silhouette_values = sample_silhouette_values[y == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        # cm.spectral was removed from Matplotlib; nipy_spectral is the
        # colormap that replaced it.
        color = cm.nipy_spectral(float(i) / k)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)
        # Label the band with its cluster index, vertically centred.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        # Start of the next band on the y axis.
        y_lower = y_upper + 10
    # Vertical dashed line marking the average silhouette score.
    ax1.axvline(x=silhouette_avg, color='red', linestyle="--")
    plt.show()

# 保存聚类结果
def saveResult(data, y, k=None, outDir='./result1'):
    """Append each cluster's rows to its own ``result<i>.csv`` file.

    Args:
        data: Sequence of rows; each row is an iterable of strings that is
            concatenated without a separator to form one output line.
        y: Cluster label of every row in ``data``.
        k: Number of clusters to write (labels ``0 .. k-1``). When omitted it
            is derived from the largest label, replacing the previously
            hard-coded 12.
        outDir: Directory receiving the files; created when missing.
            Defaults to the location the script has always used.

    Raises:
        ValueError: If ``y`` does not contain exactly one label per row of
            ``data``.
    """
    import os

    labels = np.asarray(y).reshape(-1)
    if len(labels) != len(data):
        raise ValueError(
            "y has %d labels but data has %d rows" % (len(labels), len(data)))
    if k is None:
        k = int(labels.max()) + 1 if labels.size else 0

    os.makedirs(outDir, exist_ok=True)
    for i in range(k):
        filename = os.path.join(outDir, 'result' + str(i) + '.csv')
        # Iterate over the rows that actually exist instead of a fixed
        # 13001, which raised IndexError on smaller inputs and silently
        # dropped rows on larger ones.
        rows = [''.join(data[j]) for j in np.flatnonzero(labels == i)]
        # Append mode is kept, so repeated runs add to existing files.
        with open(filename, 'a', encoding='utf8') as fr:
            fr.writelines(row + '\n' for row in rows)

if __name__ == "__main__":

    # Script entry point: load the tickets, tokenise them, build the TF-IDF
    # matrix, cluster it with Birch, then plot and save the result.

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # supported timer for measuring elapsed time.
    start = time.perf_counter()
    k = 12  # number of clusters
    jieba.load_userdict('./data/user_dict.txt')  # extra segmentation dictionary
    data,dataId = loadDataSet('./data/new_gongdan.csv')
    dataSplit = wordSplit(data)
    print('分词完成')
    saveFile('./data/new_gongdan_split.csv')  # save the segmentation result
    word, weight = TFIDF(dataSplit)  # build the tf-idf matrix
    weightPCA = weight

    # Author's note: reducing the matrix with PCA gave worse results than
    # clustering the full matrix, so the reduction is left disabled.
    #weightPCA = matrixPCA(weight, dimension = 1000)
    y = birch(weightPCA, k)
    silhouette_avg, sample_silhouette_values = Silhouette(weightPCA, y)  # silhouette scores

    Draw(silhouette_avg, sample_silhouette_values, y, k)
    saveResult(data,y)  # one csv file per cluster

    elapsed = (time.perf_counter() - start)
    print('Time use', elapsed)

 

阅读更多
换一批

没有更多推荐了,返回首页