# -*- coding: utf-8 -*-
"""
Created on Thu Aug 30 11:46:33 2018
@author: wenyun.wxw
"""
import time
import re
import os
import sys
import codecs
import shutil
import numpy as np
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# --- Compute the TF-IDF feature matrix ---
# `data` is produced earlier in this file (not visible in this chunk);
# it is assumed to be an iterable of whitespace-tokenised document
# strings suitable for CountVectorizer -- TODO confirm against the caller.
corpus = data
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# Raw term-count sparse matrix -> TF-IDF-weighted sparse matrix.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
# Vocabulary: one term per column of the matrix.
words = vectorizer.get_feature_names()
# Dense (n_documents, n_terms) array of TF-IDF weights; this is the
# feature matrix the K-means steps below cluster on.
weight = tfidf.toarray()
# --- K-means clustering ---
# --------- Choosing K ------------------
# Elbow method: plot SSE against K; the inflection point ("elbow") marks
# the best K.
SSE = []  # sum of squared errors (KMeans inertia) for each K
# Upper limit for K is 20.
kmax = 20
# BUG FIX: `range(1, kmax)` only tested k = 1..19 even though the comment
# declares 20 as the upper limit; `kmax + 1` makes the range inclusive.
for k in range(1, kmax + 1):
    estimator = KMeans(n_clusters=k, init='k-means++')  # build the clusterer
    estimator.fit(weight)
    SSE.append(estimator.inertia_)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(range(1, kmax + 1), SSE, 'o-')
plt.show()
# The elbow plot was inconclusive (no clear inflection point), so fall
# back to the silhouette method: the k with the largest mean silhouette
# coefficient is the best choice. (Silhouette needs at least 2 clusters.)
Scores = []  # mean silhouette coefficient for each K
# BUG FIX: `range`'s upper bound is exclusive, so `range(2, kmax)` skipped
# k = kmax; use `kmax + 1` to actually evaluate up to the declared limit.
for k in range(2, kmax + 1):
    estimator = KMeans(n_clusters=k, init='k-means++')  # build the clusterer
    estimator.fit(weight)
    Scores.append(silhouette_score(weight, estimator.labels_, metric='euclidean'))
plt.xlabel('k')
plt.ylabel('轮廓系数')
plt.plot(range(2, kmax + 1), Scores, 'o-')
plt.show()
# k=5 gave the (relatively) largest silhouette score.
# --------- Clustering ------------------
K = 7
clf = KMeans(n_clusters=K, init='k-means++')
# Fit on the TF-IDF weight matrix. (The previous unused `s = clf.fit(...)`
# binding has been removed; `fit` mutates `clf` in place.)
clf.fit(weight)
# Cluster label assigned to each sample/document.
label = clf.labels_
print(clf.inertia_)
# Write one "<cluster label>,<original document>" line per sample.
# `with` guarantees the handle is closed even if a write fails.
with open('分类结果.csv', 'w', encoding='gbk', errors='ignore') as file:
    for i, lab in enumerate(label):
        file.write(','.join((str(lab), str(data[i]))) + '\n')
# Split the samples out per cluster and report on each one.
def divide(word, k):
    """Build a word-frequency table and a word cloud for cluster ``k``.

    Parameters
    ----------
    word : sequence of token lists, one per sample, aligned index-for-index
        with the module-level ``label`` array produced by the K-means fit.
    k : int
        Cluster index to extract.

    Side effects: saves the word-cloud image '分类_第<k>类.png' and writes
    the frequency table '词频统计_分类<k>.csv' (GBK, errors ignored).
    Returns 0, kept for backward compatibility with existing callers.
    """
    # Flatten the token lists of every sample assigned to cluster k into
    # one list of words.
    wordall = []
    for i, lab in enumerate(label):
        if lab == k:
            wordall.extend(word[i])
    # Word-frequency count.
    wordcount = {}
    for item in wordall:
        wordcount[item] = wordcount.get(item, 0) + 1
    # Sort by frequency, highest first, for the CSV output.
    wordcount_sort = sorted(wordcount.items(), key=lambda item: item[1], reverse=True)
    # Word-cloud visualisation.
    # BUG FIX: WordCloud's font_path must be a plain filesystem path -- it is
    # opened directly by the font loader, so the previous "file:///C:/..."
    # URL could never be loaded.
    wordcloud = WordCloud(background_color="white",
                          font_path="C:/windows/Fonts/MSYH.TTC",
                          width=1000, height=860, margin=2).fit_words(wordcount)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig('分类_第' + str(k) + '类.png')
    # `with` ensures the CSV handle is closed even if a write fails.
    with open('词频统计_分类' + str(k) + '.csv', 'w', encoding='gbk', errors='ignore') as file:
        for term, count in wordcount_sort:
            file.write(','.join((term, str(count))) + '\n')
    return 0
if __name__ == '__main__':
    # `fenci` (tokeniser) is defined elsewhere in this file -- not visible
    # in this chunk. It appears to return per-document token lists plus
    # overall frequency data; only `word` is used below.
    # NOTE(review): confirm fenci's return contract against its definition.
    word,wordcount,wordcount_sort=fenci(data)
    # Produce a frequency CSV and a word-cloud image for every cluster.
    for k in range(K):
        divide(word,k)