# -*- coding: cp936 -*-
import jieba, os
from gensim import corpora, models, similarities
# Walk the corpus directory, segment every document with jieba, then train
# a 10-topic LDA model on the TF-IDF-weighted bag-of-words corpus and print
# the discovered topics.
train_set = []
walk = os.walk('/home/8888/test1')
for root, dirs, files in walk:
    for name in files:
        # Context manager closes each handle promptly; the original
        # opened files and never closed them (handle leak).
        with open(os.path.join(root, name), 'r') as f:
            raw = f.read()
        # Accurate mode (cut_all=False): non-overlapping segmentation,
        # suitable for building a document-term corpus.
        word_list = list(jieba.cut(raw, cut_all=False))
        train_set.append(word_list)

# Map each distinct token to an integer id, then represent every
# document as a sparse bag-of-words vector.
dic = corpora.Dictionary(train_set)
corpus = [dic.doc2bow(text) for text in train_set]
tfidf = models.TfidfModel(corpus)  # fit the TF-IDF model on the corpus
corpus_tfidf = tfidf[corpus]  # re-weight the corpus with TF-IDF scores
# Train LDA on the TF-IDF corpus.
# NOTE(review): stop words are not filtered yet (acknowledged in the
# original comment) — consider removing them before Dictionary().
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_lda = lda[corpus_tfidf]  # project the corpus into topic space
for i in range(10):
    # Single-argument print(...) is valid in both Python 2 and Python 3,
    # unlike the original Py2-only print statement.
    print(lda.print_topic(i))
import jieba, os
from gensim import corpora, models, similarities
# NOTE(review): this section is a verbatim duplicate of the run above
# (likely a copy-paste); it re-reads the corpus and re-trains LDA from
# scratch. Kept in place, with the same fixes applied.
train_set = []
walk = os.walk('/home/8888/test1')
for root, dirs, files in walk:
    for name in files:
        # Close each file deterministically (original leaked handles).
        with open(os.path.join(root, name), 'r') as f:
            raw = f.read()
        # Accurate-mode jieba segmentation for corpus construction.
        word_list = list(jieba.cut(raw, cut_all=False))
        train_set.append(word_list)

# Token -> integer id mapping, then sparse bag-of-words per document.
dic = corpora.Dictionary(train_set)
corpus = [dic.doc2bow(text) for text in train_set]
tfidf = models.TfidfModel(corpus)  # fit the TF-IDF model
corpus_tfidf = tfidf[corpus]  # apply TF-IDF weighting
# Train LDA on the weighted corpus; stop-word filtering still TODO
# (acknowledged in the original comment).
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_lda = lda[corpus_tfidf]  # topic-space representation of the corpus
for i in range(10):
    # print(...) with one argument works under Python 2 and Python 3.
    print(lda.print_topic(i))