构造词典时,输入不应是原始文档(字符串)的集合,而应是分词后的结果:一个由词(token)列表组成的列表,每个内层列表对应一篇文档。
import nltk
from gensim.models.ldamulticore import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import datapath
class LDA:
    """Helper that turns a text file into tokenized documents for gensim."""

    def __init__(self):
        super().__init__()

    def getDataArray(self, src):
        """Read ``src`` and tokenize it one line at a time.

        Every line of the file is treated as a separate document and is
        split into tokens with ``nltk.word_tokenize``. The zero-based index
        of each line is printed as a progress indicator, followed by a
        completion message once the whole file has been processed.

        Args:
            src: Path of the text file to read; it is decoded as UTF-8.

        Returns:
            A list with one entry per line of the file, each entry being
            the tokens produced for that line.

        Raises:
            OSError: If ``src`` cannot be opened (e.g. it does not exist).
        """
        dataset = []  # tokenized documents, later used as gensim input
        # The context manager closes the handle even when tokenizing raises,
        # which the previous bare open() call never did.
        with open(src, 'r', encoding='UTF-8') as file:
            # Stream the file instead of loading every line via readlines().
            for count, text in enumerate(file):
                print(count)  # progress: index of the line being tokenized
                dataset.append(nltk.word_tokenize(text))
        print('Get data array complete!')
        return dataset
if __name__ == "__main__":
    loader = LDA()

    # The two bare names below are placeholders ("your file path" and
    # "your dictionary path"); replace them with real path values before
    # running, otherwise the script stops here with a NameError.
    path = 你的文件路径
    dicFile = 你的词典路径

    # One token list per line of the input file.
    tokenized_docs = loader.getDataArray(path)

    # Build the vocabulary from the tokenized documents and save it as text.
    vocabulary = Dictionary(tokenized_docs)
    vocabulary.save_as_text(dicFile)

    # Convert every document to its numeric bag-of-words representation.
    corpus = []
    for doc_tokens in tokenized_docs:
        corpus.append(vocabulary.doc2bow(doc_tokens))

    # Train the topic model on the prepared corpus.
    topic_model = LdaModel(corpus, num_topics=5, alpha='auto', eval_every=5)

    # Save the trained model under the location returned by datapath('model').
    model_file = datapath('model')
    topic_model.save(model_file)