# xiaodu_julei.py-20180830

# -*- coding: utf-8 -*-
"""
Created on Thu Aug 30 11:46:33 2018

@author: wenyun.wxw
"""

import time          
import re          
import os  
import sys
import codecs
import shutil
import numpy as np
from sklearn import feature_extraction  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer 
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans 
import pandas as pd 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Compute the TF-IDF matrix for the corpus.
# `data` is defined outside this section; presumably an iterable of document
# strings -- TODO confirm against whatever produces it.
corpus = data
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
term_counts = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(term_counts)
# Vocabulary terms. get_feature_names() was removed in scikit-learn 1.2, so
# prefer get_feature_names_out() when the installed release provides it and
# fall back to the old accessor otherwise. The result is converted to a list
# so `words` keeps the type the old accessor returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()
# Dense array of the TF-IDF weights (converted from the sparse result).
weight = tfidf.toarray()
 
# K-means clustering
# --------- choosing K ------------------
# Elbow method: plot SSE against K; the best K sits at the bend of the curve.
kmax = 20  # end of the scanned range, so K takes the values 1 .. kmax-1
elbow_ks = range(1, kmax)
SSE = []  # sum of squared errors (KMeans inertia) for each K
for k in elbow_ks:
    # fit() returns the fitted estimator itself.
    estimator = KMeans(n_clusters=k, init='k-means++').fit(weight)
    SSE.append(estimator.inertia_)
plt.plot(elbow_ks, SSE, 'o-')
plt.xlabel('k')
plt.ylabel('SSE')
plt.show()
# Author's note: the result is poor; no elbow can be made out.

# Silhouette method: the K with the largest silhouette coefficient is best.
silhouette_ks = range(2, kmax)
Scores = []  # silhouette coefficient for each K
for k in silhouette_ks:
    estimator = KMeans(n_clusters=k, init='k-means++')
    # fit_predict() fits the model and returns the per-sample labels.
    cluster_ids = estimator.fit_predict(weight)
    Scores.append(silhouette_score(weight, cluster_ids, metric='euclidean'))
plt.plot(silhouette_ks, Scores, 'o-')
plt.xlabel('k')
plt.ylabel('轮廓系数')
plt.show()
# Author's note: k=5 gives the relatively largest value.

# --------- clustering ------------------
# NOTE(review): the comment after the silhouette plot reports k=5 as the best
# value, yet K is set to 7 here -- confirm which one is intended.
K = 7
clf = KMeans(n_clusters=K, init='k-means++')
s = clf.fit(weight)
# Clustering result: the cluster index assigned to each sample.
label = clf.labels_
print(clf.inertia_)

# Write one "<cluster>,<document>" line per sample. The context manager
# guarantees the file is closed even if a write fails part-way through.
# Characters that GBK cannot encode are dropped (errors='ignore').
with open('分类结果.csv', 'w', encoding='gbk', errors='ignore') as file:
    for i, cluster_id in enumerate(label):
        file.write(','.join((str(cluster_id), str(data[i]))) + '\n')

# Split the samples by cluster and summarise a single cluster.
def divide(word, k):
    """Summarise cluster ``k`` as a word-cloud image and a frequency CSV.

    Picks the entries of ``word`` whose position is labelled ``k`` in the
    module-level ``label`` sequence, counts every token they contain, then

    * saves a word cloud of those counts to ``分类_第<k>类.png``, and
    * writes ``token,count`` lines, most frequent first, to
      ``词频统计_分类<k>.csv`` (GBK; unencodable characters are dropped).

    Args:
        word: Indexable collection aligned with ``label``; each entry is an
            iterable of string tokens.
        k: Cluster label to summarise.

    Returns:
        Always ``0``.
    """
    from collections import Counter

    # Count, in a single pass, the tokens of every sample assigned to k.
    wordcount = Counter(
        token
        for i, cluster_id in enumerate(label)
        if cluster_id == k
        for token in word[i]
    )

    # Highest frequency first; equal counts keep first-seen order.
    wordcount_sort = wordcount.most_common()

    # Render the word cloud.
    # NOTE(review): font_path is a file:// URI rather than a plain filesystem
    # path -- confirm WordCloud can actually load the font from it.
    wordcloud = WordCloud(
        background_color="white",
        font_path="file:///C:/windows/Fonts/MSYH.TTC",
        width=1000,
        height=860,
        margin=2,
    ).fit_words(wordcount)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.savefig('分类_第' + str(k) + '类.png')

    # The context manager closes the CSV even if a write raises.
    csv_name = '词频统计_分类' + str(k) + '.csv'
    with open(csv_name, 'w', encoding='gbk', errors='ignore') as out:
        for token, count in wordcount_sort:
            out.write(','.join((token, str(count))) + '\n')

    return 0

if __name__ == '__main__':
    # fenci() is defined outside this section; presumably it tokenises
    # `data` -- verify. Only its first result is used below.
    word, wordcount, wordcount_sort = fenci(data)
    # Produce the word cloud and frequency table for every cluster label.
    for cluster_id in range(K):
        divide(word, cluster_id)

 

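# --- Web-page residue copied along with the script (blog comment box and
# --- payment widget text); not part of the program. Kept verbatim below.
# 评论
# 添加红包
#
# 请填写红包祝福语或标题
#
# 红包个数最小为10个
#
# 红包金额最低5元
#
# 当前余额3.43前往充值 >
# 需支付:10.00
# 成就一亿技术人!
# 领取后你会自动成为博主和红包主的粉丝 规则
# hope_wisdom
# 发出的红包
# 实付
# 使用余额支付
# 点击重新获取
# 扫码支付
# 钱包余额 0
#
# 抵扣说明:
#
# 1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
# 2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。
#
# 余额充值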