1、所用的文档格式:.txt
这里用的是已经分词好的 txt 文本文档。
2、LDA主题建模
将文本文档转化为列表,然后构建词典,并将语料向量化表示(词袋模型)。
import gensim
from gensim import corpora
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import warnings
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
# Load the pre-segmented text file and turn it into a list of tokens.
# The context manager guarantees the file handle is closed, even if
# reading or decoding fails.
with open("D:\\data\\1\\lda_江苏.txt", 'r', encoding='utf-8') as f:
    content = f.read()

# One token per line. Blank lines (including the empty string produced by a
# trailing newline) are dropped so they do not enter the vocabulary as a
# meaningless "" token.
my_content = [line for line in content.split('\n') if line.strip()]

# Build the dictionary (token -> integer id) and the bag-of-words corpus.
# NOTE(review): the whole file is passed as ONE document whose tokens are the
# individual lines. If each line is actually a separate document of
# space-separated words, this should be `[line.split() for line in my_content]`
# instead -- confirm against the input file format.
documents = [my_content]
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(text) for text in documents]

# Train the LDA model with 5 topics, making 10 passes over the corpus.
ldamodel = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)
for topic_id in range(ldamodel.num_topics):
print(f