# -*- coding: utf-8 -*-
"""
Created on Wed Apr 18 11:56:02 2018
@author: NAU
"""
#导入包
import random
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
#词频tfidf权重计算
# --- TF-IDF weight computation -------------------------------------------
# Each line of the segmented corpus file is treated as one document.
corpus = []
tfidfdict = {}  # NOTE(review): never populated in this script; kept for compatibility
# Fix: read the corpus inside a `with` block so the input file is always closed.
with open('E:\\seg_ty.txt', 'r') as seg_ty:
    corpus = [line.strip() for line in seg_ty]
# Output files; these handles are written to by later sections of the script,
# so they stay module-level (closed implicitly at interpreter exit).
tfidf_ty_word = open('E:\\tfidf_ty_word.txt', 'w')      # vocabulary output
tfidf_ty_result = open('E:\\tfidf_ty_result.txt', 'w')  # per-document tf-idf rows
cluster_result = open("E:\\cluster_result.txt", 'w')    # clustering inertia output
vectorizer = CountVectorizer()    # term-frequency matrix: a[i][j] = count of word j in doc i
transformer = TfidfTransformer()  # converts raw counts to tf-idf weights
# Inner fit_transform builds the count matrix; outer one converts it to tf-idf.
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
word = vectorizer.get_feature_names()  # all words in the bag-of-words vocabulary
weight = tfidf.toarray()               # dense (n_docs x n_words) tf-idf matrix
# Write the vocabulary (space-separated), then one tf-idf weight row per document.
# Fix: removed dead locals `getword`/`getvalue` that were assigned but never used,
# and iterate the sequences directly instead of indexing by range(len(...)).
for w in word:
    tfidf_ty_word.write(w + ' ')
tfidf_ty_word.write('\r\n\r\n')
for i, row in enumerate(weight):
    print("这是第", i, "类文本的词语tfidf权重.")
    # One space-separated weight per vocabulary word, in vocabulary order.
    for value in row:
        tfidf_ty_result.write(str(value) + ' ')
    tfidf_ty_result.write('\r\n\r\n')
# --- K-means clustering ---------------------------------------------------
# Cluster the documents into 20 groups using their tf-idf weight vectors.
# Fix: removed dead assignments (`s = clf.fit(...)` result unused, `labels = []`
# never used) and replaced the manual 1-based while loop with enumerate.
clf = KMeans(n_clusters=20)
clf.fit(weight)
print(clf.cluster_centers_)  # the 20 cluster centroids
print(clf.labels_)           # cluster index assigned to each document
# Print each document's 1-based index alongside its cluster label
# (identical output to the original while loop).
for i, label in enumerate(clf.labels_, start=1):
    print(i, label)
# inertia_: sum of squared distances of samples to their closest centroid.
# Lower is tighter; useful for choosing the number of clusters (elbow method).
print(clf.inertia_)
cluster_result.write(str(clf.inertia_) + ' ')
# --- PCA dimensionality reduction ----------------------------------------
# Project the tf-idf weight matrix down to 5 principal components.
from sklearn.decomposition import PCA

reducer = PCA(n_components=5)           # target dimensionality
newData = reducer.fit_transform(weight)  # fit on tf-idf weights, return reduced data
print(newData)