def print_topic(texts, n_topics, n_words):
    """Train an LDA topic model and report public vs. per-day topics.

    :param texts: list of tokenized documents (one token list per day)
    :param n_topics: number of LDA topics to fit
    :param n_words: number of top words to keep per topic
    :return: (main_topics, p_topic, days_topic) where
             main_topics[i] is the (topic_id, weight) pair with the highest
             weight for document i; p_topic is the list of words appearing
             in every topic's top-word list (the "public" topic); and
             days_topic[i] is the word list unique to document i's
             dominant topic.
    """
    # Build the vocabulary and bag-of-words corpus, and persist the corpus.
    vocabs = corpora.Dictionary(texts)
    corpus = [vocabs.doc2bow(tokens) for tokens in texts]
    corpora.MmCorpus.serialize('weibo.mm', corpus)

    # Fit the LDA model (fixed random_state for reproducible topics).
    lda = models.ldamodel.LdaModel(corpus=corpus, id2word=vocabs,
                                   num_topics=n_topics, random_state=1)

    # For each document, keep only its highest-weighted (topic_id, weight)
    # pair. max() picks the first maximum, matching the original
    # sorted(..., reverse=True)[0] behavior on ties.
    corpus_lda = lda[corpus]
    main_topics = [max(corpus_lda[i], key=lambda pair: pair[1])
                   for i in range(len(texts))]

    # Top-n_words word list per topic. show_topic() is hoisted out of the
    # word loop — the original re-queried the model once per word — and a
    # plain list replaces the fragile locals()['t'+str(t)] dynamic-variable
    # hack, whose writes are implementation-dependent in CPython.
    topic_words = [[word for word, _ in lda.show_topic(t, n_words)]
                   for t in range(n_topics)]

    # Words shared by every topic's top-word list form the public topic.
    shared = set(topic_words[0]) if topic_words else set()
    for words in topic_words[1:]:
        shared &= set(words)
    p_topic = list(shared)
    print("public topic:", p_topic)  # report the public topic

    # Each topic's word list with the shared (public) words removed.
    day_topics = []
    for t, words in enumerate(topic_words):
        unique = list(set(words) - shared)
        day_topics.append(unique)
        print(t, ":", unique)  # report each topic's unique words

    # For each document, the unique-word list of its dominant topic.
    days_topic = [day_topics[topic_id] for topic_id, _ in main_topics]
    return main_topics, p_topic, days_topic
# NOTE: trailing web-scrape residue (blog dates and a "was this helpful?"
# feedback widget) removed — it was not part of the source code and made
# the file syntactically invalid.