data.txt 是经过预处理等操作后生成的文本数据,每一行代表一条数据(示例如下):
in conjunction with the release of the the allen institute for ai partnered with
the recent outbreak of the deadly and highly infectious covid disease caused by
coronaviruses is related illness that vary from a common cold more severe
it is shown that the evaporation rate of a liquid sample containing the
covid illness an on going epidemic started in wuhan city china in december
in the beginning of december covid virus that slipped from animals humans in
建模代码:
from gensim import corpora
import gensim # pip install gensim
def _train_lda(documents, num_topic=10):
    """Fit an LDA model on tokenised documents.

    Args:
        documents: A list of documents, each a list of string tokens.
        num_topic: Number of latent topics the model is trained with.

    Returns:
        The trained ``gensim.models.ldamodel.LdaModel``.
    """
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(tokens) for tokens in documents]
    return gensim.models.ldamodel.LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=num_topic
    )


def get_topic(all_contents, num_topic=10):
    """Train an LDA model, print its topics and save it to disk.

    Each topic is printed as its 20 highest-weighted words, and the model is
    written to ``lda_<num_topic>.model`` in the current working directory.
    The call is best-effort: any failure is reported on stdout and swallowed
    so that a caller looping over several topic counts can keep going.

    Args:
        all_contents: Iterable of documents, each a list of string tokens.
        num_topic: Number of topics the LDA model is trained with.

    Returns:
        None.
    """
    try:
        # The documents are traversed twice (dictionary, then bag-of-words),
        # so materialise them once; a one-shot iterable would otherwise yield
        # an empty corpus and an untrained model without any error.
        documents = list(all_contents)
        lda = _train_lda(documents, num_topic=num_topic)
        # num_topics=-1 asks gensim for every topic; the library default caps
        # the listing at 20, which would truncate larger models.
        for topic in lda.print_topics(num_topics=-1, num_words=20):
            print(topic[1])  # topic is (topic_id, formatted word weights)
        # Persist the model so each topic count can be inspected later.
        lda.save('lda_' + str(num_topic) + '.model')
    except Exception as e:
        # Keep the best-effort contract, but say which run failed and with
        # what kind of error; a bare message is ambiguous across many runs.
        print('get_topic(num_topic={}) failed: {!r}'.format(num_topic, e))
# Load the preprocessed corpus: one document per line, tokens separated by
# whitespace. The context manager closes the file handle once it is read.
with open('data.txt') as corpus_file:
    data = [content.split() for content in corpus_file]

# Train, print and save one model for every topic count from 1 to 16 so the
# results can be compared afterwards.
for i in range(16):
    get_topic(data, i + 1)