import chardet
import jieba
import gensim
import codecs
import time
# Load the first 10000 records of the Sohu news dump.
#
# The file is treated as a sequence of 6-line records. Within each record,
# line index 3 is stored in S1 and line index 4 in S2. The slice offsets
# match the lengths of "<contenttitle>"/"</contenttitle>" (14/15 chars) and
# "<content>"/"</content>" (9/10 chars) -- presumably those are the wrapping
# tags; verify against the data file.
S1 = []  # text taken from line 3 of every record
S2 = []  # text taken from line 4 of every record
ans = 0  # number of lines consumed so far
with open('C:\\Users\\28612\\Desktop\\news_sohusite_xml.dat', 'rb') as f:
    # Stream the file line by line instead of f.readlines(): only the first
    # 60000 lines are needed, so the rest of the dump is never read.
    for line in f:
        field = ans % 6
        if field == 3 or field == 4:
            # Decode only the lines that are kept. The line terminator is
            # stripped first so the fixed-width slices below stay correct
            # for "\n", "\r\n" and an unterminated final line.
            s = line.decode('GB2312', "ignore").rstrip('\r\n')
            if field == 3:
                S1.append(s[14:-15])
            else:
                S2.append(s[9:-10])
        ans = ans + 1
        # Integer comparison (no float division): stop after 10000 records.
        if ans == 6 * 10000:
            break
# Records whose line-4 text is empty are dropped before modelling.
doc_set = [s for s in S2 if s != '']
print(len(doc_set))
# Read the stopword list, one word per line (only the newline is stripped).
# The words are stored as dict keys with a dummy value of 1, so that the
# later `w not in stopwords` test is a hash lookup.
with open('C:\\Users\\28612\\Desktop\\stopwords.txt', 'r') as f:
    stopwords = {w.strip('\n'): 1 for w in f}
# Segment every document with jieba and drop the stopwords, timing the
# whole pass. `texts` ends up as one token list per document.
time_start = time.time()
texts = [
    [w for w in jieba.cut(i) if w not in stopwords]
    for i in doc_set
]
time_end = time.time()
print('去停用词:', time_end - time_start)
# The bare string literal below is disabled code and is never executed.
# It wrote each token list (its length, then the space-separated tokens)
# to a GB2312 file while echoing the same to stdout. Its indentation is
# lost, so it must be re-indented before it can be re-enabled.
"""
with codecs.open('C:\\Users\\28612\\Desktop\\data', 'w', encoding='GB2312') as f:
for text in texts:
f.write(str(len(text)) + '\n')
print(str(len(text)))
for w in text:
f.write(w + ' ')
print(w,end=' ')
f.write('\n')
print()
"""
# Build the token <-> id vocabulary from the tokenised documents, convert
# each document to its bag-of-words form, and train an 18-topic LDA model
# with 10 passes over the corpus.
dictionary = gensim.corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=18,
    passes=10,
)
# Print every entry returned by print_topics(), one per line.
topic_rows = ldamodel.print_topics()
for top in topic_rows:
    print(top)
# The bare string literal below is disabled code and is never executed.
# It skipped the first (10000 + 5) * 6 lines of the dump, collected a few
# later line-4 texts, tokenised them the same way, and printed the topics
# the trained model assigns to them. Its indentation is lost, so it must be
# re-indented before it can be re-enabled.
"""
doc_test = []
with open('C:\\Users\\28612\\Desktop\\news_sohusite_xml.dat', 'rb') as f:
lines = f.readlines()
ans = 0
for line in lines:
if ans/6<10000+5:
ans = ans + 1
continue
s = line.decode('GB2312', "ignore")
if ans % 6 == 4 and s!='':
doc_test.append(s[9:-11])
print(doc_test[-1])
if(len(doc_test)>5):
break;
ans = ans + 1
texts_test = []
for i in doc_test:
doc = list(jieba.cut(i))
texts_test.append([w for w in doc if w not in stopwords])
corpus_test = [dictionary.doc2bow(text) for text in texts_test]
topics_test = ldamodel.get_document_topics(corpus_test)
for top in topics_test:
print(top)
"""
# LDA Python code
# (Web-page residue: "latest recommended article, published 2024-05-17 08:57:45")