import numpy as np
import jieba
# Read the source text (the report to the 19th CPC National Congress) into a
# one-document corpus; encoding='utf-8' avoids platform-default decoding errors.
with open('十九大报告.txt', 'r', encoding='utf-8') as f:
    documents = [f.read()]
# Segment each document with jieba and join the tokens with spaces,
# the whitespace-delimited format CountVectorizer expects.
result = []
for doc in documents:
    doc_cut = jieba.cut(doc)
    result.append(' '.join(doc_cut))
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Build the term-frequency matrix over the segmented corpus.
corpus = result
cntVector = CountVectorizer()
cntTf = cntVector.fit_transform(corpus)
vocs = cntVector.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
print('Vocabulary size:', len(vocs))
print(vocs)
# Fit a two-topic LDA model; note that learning_offset only takes effect when
# learning_method='online' (the default method is 'batch').
lda = LatentDirichletAllocation(n_components=2,
                                learning_offset=50.,
                                random_state=0)
docres = lda.fit_transform(cntTf)
LDA_corpus = np.array(docres)
print('Topic distribution of each document:\n', LDA_corpus)
# Assign each document to its most probable topic.
LDA_corpus_one = np.argmax(LDA_corpus, axis=1)
print('Topic assigned to each document:', LDA_corpus_one)
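# A minimal sketch (not in the original): scoring additional text against the
# already-fitted topics. Segment it the same way and reuse the fitted
# vectorizer via transform(); `new_text` is a hypothetical example string.
new_text = '不忘初心，牢记使命。'
new_bow = cntVector.transform([' '.join(jieba.cut(new_text))])
print('Topic distribution of the new text:', lda.transform(new_bow))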
# For each topic, print the eight highest-weighted words.
tt_matrix = lda.components_
for topic_id, tt_m in enumerate(tt_matrix):
    top_terms = sorted(zip(vocs, tt_m), key=lambda x: x[1], reverse=True)[:8]
    print('Topic %d:' % topic_id, top_terms)
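# A minimal optional sketch (not in the original): with raw counts, Chinese
# function words tend to dominate the topic-word lists. Passing a stopword
# list to CountVectorizer usually yields cleaner topics; 'stopwords.txt'
# (one word per line) is an assumed local file, not part of the original.
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stop_list = [line.strip() for line in f if line.strip()]
cntVector = CountVectorizer(stop_words=stop_list)
cntTf = cntVector.fit_transform(corpus)  # refit, then re-run the LDA steps above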