# Upgraded K-means clustering: tf-idf + PCA dimensionality reduction + k-means.
# coding:utf-8
# 2.0 Use jieba for word segmentation (dropping the inefficient NLPIR); weight terms with TextRank (tested better in practice)
# 2.1 tf-idf via gensim
# 2.2 tf-idf and k-means via sklearn
# 2.3 Replace k-means with BIRCH, keeping the traditional tf-idf features
import logging
import time
import os
import jieba
import glob
import random
import copy
import chardet
import gensim
import matplotlib.pyplot as plt
from gensim import corpora,similarities, models
from pprint import pprint
import jieba.analyse
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import os
from sklearn.decomposition import PCA
# Record the start time so total elapsed time can be reported at the end.
# NOTE: time.clock() was deprecated in 3.3 and REMOVED in Python 3.8;
# time.perf_counter() is the portable high-resolution replacement.
start = time.perf_counter()
print( '#----------------------------------------#')
print( '# #')
print( '# 分词+去停用词 #')
print( '# #')
print( '#----------------------------------------#\n')
def DeleteStopWords(data, stopWords):
    """Segment *data* with jieba and drop every token found in *stopWords*.

    Args:
        data: raw text (str) to segment.
        stopWords: container of stop words as str (anything supporting ``in``).

    Returns:
        list[str]: the segmented tokens with stop words removed.
    """
    # BUG FIX: the original compared `item.encode('utf-8')` (bytes) against
    # stopWords; under Python 3 a bytes object never equals a str stop word,
    # so the filter silently removed nothing. Compare str-to-str instead.
    return [item for item in jieba.cut(data) if item not in stopWords]
# Banner marking the start of the tf-idf stage.
for _banner_line in (
    '#----------------------------------------#',
    '# #',
    '# tf-idf #',
    '# #',
):
    print(_banner_line)
print( '#-------------------