# 1. 单个文本的主题建模 (topic modeling for a single text)
import jieba
from gensim import corpora, models
import pyLDAvis.gensim
# Single-text LDA pipeline: read one UTF-8 text file, segment it with jieba,
# fit a 5-topic gensim LDA model on it, and save a pyLDAvis page as HTML.

# Read the whole input file. The read must be indented under `with` so that
# it runs while the file is open and the handle is closed right afterwards.
with open('topic1.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Segment the text and keep only tokens longer than one character.
seg_list = jieba.cut(text)
tokens = [token for token in seg_list if len(token) > 1]

# Build the dictionary and the bag-of-words corpus. The token list is wrapped
# in an outer list because the whole file is treated as a single document.
dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(tokens)]

# Fit the LDA model with 5 topics.
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary)

# Prepare the visualization data from the model, corpus and dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

# Save the visualization as an HTML file.
pyLDAvis.save_html(data, 'visualization1.html')
# 2. 多个文本的主题建模 (topic modeling for multiple texts)
import jieba
from gensim import corpora, models
import pyLDAvis.gensim
# Multi-text LDA pipeline: read several UTF-8 text files, segment each with
# jieba, fit a 5-topic gensim LDA model with one document per file, and save
# a pyLDAvis page as HTML.

# Read every input file into memory, one string per file.
file_paths = ['1-1-1.txt', '1-1-2.txt', '1-1-3.txt']  # assumes three text files exist
texts = []
for file_path in file_paths:
    # The read is nested under `with` so each file is closed before the next
    # one is opened.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    texts.append(text)

# Segment each text and keep only tokens longer than one character.
tokenized_texts = []
for text in texts:
    seg_list = jieba.cut(text)
    tokens = [token for token in seg_list if len(token) > 1]
    tokenized_texts.append(tokens)

# Build the shared dictionary and one bag-of-words vector per document.
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

# Fit the LDA model with 5 topics.
lda_model = models.LdaModel(corpus, num_topics=5, id2word=dictionary)

# Prepare the visualization data from the model, corpus and dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` - confirm against the installed version.
data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)

# Save the visualization as an HTML file (nothing is displayed inline).
pyLDAvis.save_html(data, 'visualization1-2-3.html')