"""Preprocess a folder of TXT files into a TF-IDF corpus for LDA topic modeling."""
import os

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import LdaModel  # used by the LDA section that follows
from gensim import models
from gensim.corpora import Dictionary

# Collect every regular file directly under the reference-data directory.
# (Renamed the original `list` variable: it shadowed the builtin `list`.)
rootdir = 'F:/GEV/lda_dir/referenceData'
filepaths = []
for entry in os.listdir(rootdir):
    path = os.path.join(rootdir, entry)
    if os.path.isfile(path):
        filepaths.append(path)

# Read each document; `with` guarantees the file handles are closed
# (the original comprehension left every file open).
docs = []
for filepath in filepaths:
    with open(filepath, 'r', encoding='utf-8') as f:
        docs.append(f.read())

# Tokenize and drop English stopwords (case-insensitive match).
docs = [word_tokenize(doc) for doc in docs]
stopWords = set(stopwords.words('english'))
docs = [[w for w in doc if w.lower() not in stopWords] for doc in docs]

# Map tokens to integer ids, build bag-of-words vectors, then reweight
# the counts with TF-IDF for the LDA run below.
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(text) for text in docs]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
#########Run the LDA model for XX topics #########################
# LDA in Python: processing an LDA topic model with the Gensim package
# (source article metadata: latest recommended revision published 2024-04-28 13:49:58)