# LDA Python 代码

import chardet
import jieba
import gensim
import codecs
import time

# Load the first MAX_RECORDS records of the news dump.
#
# The file is consumed in groups of RECORD_LINES lines. Within each group the
# line at offset 3 is sliced with [14:-16] and the line at offset 4 with
# [9:-11]. NOTE(review): those offsets match the lengths of
# "<contenttitle>"/"</contenttitle>\n" and "<content>"/"</content>\n", so S1
# presumably holds titles and S2 article bodies -- confirm against the data.
RECORD_LINES = 6
MAX_RECORDS = 10000

S1 = []
S2 = []
with open('C:\\Users\\28612\\Desktop\\news_sohusite_xml.dat', 'rb') as f:
    # Iterate the file object lazily instead of calling readlines(): only the
    # first RECORD_LINES * MAX_RECORDS lines are needed, so the rest of the
    # dump is never read into memory.
    for ans, line in enumerate(f):
        # Integer line limit; replaces the float test `ans / 6 == 10000`.
        if ans >= RECORD_LINES * MAX_RECORDS:
            break
        # Bytes that are not valid GB2312 are silently dropped ("ignore").
        s = line.decode('GB2312', "ignore")
        offset = ans % RECORD_LINES
        if offset == 3:
            S1.append(s[14:-16])
        elif offset == 4:
            S2.append(s[9:-11])

# Keep only the non-empty entries of S2 as the training documents.
doc_set = [s for s in S2 if s != '']
print(len(doc_set))

# Build the stop-word lookup table: one key per line of the stop-word file.
# A dict is used so that the later `not in stopwords` test is a hash lookup.
# NOTE(review): the file is opened with the platform default encoding.
with open('C:\\Users\\28612\\Desktop\\stopwords.txt', 'r') as f:
    # Only newline characters are stripped; any other whitespace is kept.
    stopwords = {entry.strip('\n'): 1 for entry in f}

# Tokenize every document with jieba and drop tokens found in `stopwords`,
# timing the whole pass with wall-clock time.
time_start = time.time()
texts = [
    [token for token in jieba.cut(document) if token not in stopwords]
    for document in doc_set
]
time_end = time.time()
print('去停用词:', time_end - time_start)

# The string literal below is disabled code (a no-op expression statement).
# If enabled it would write, for each tokenized document, its token count and
# then its space-separated tokens to a GB2312-encoded file, echoing to stdout.
"""
with codecs.open('C:\\Users\\28612\\Desktop\\data', 'w', encoding='GB2312') as f:
    for text in texts:
        f.write(str(len(text)) + '\n')
        print(str(len(text)))
        for w in text:
            f.write(w + ' ')
            print(w,end=' ')
        f.write('\n')
        print()
"""

# Build the token <-> id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(texts)

# Convert each tokenized document with doc2bow.
corpus = list(map(dictionary.doc2bow, texts))

# Train an 18-topic LDA model with 10 passes over the corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=18,
    passes=10,
)

# Print each entry returned by print_topics().
for topic in ldamodel.print_topics():
    print(topic)


# The string literal below is disabled code (a no-op expression statement).
# If enabled it would skip past the training portion of the dump, collect a
# few further documents, tokenize them the same way, and print the topic
# distribution the trained model assigns to each.
"""
doc_test = []
with open('C:\\Users\\28612\\Desktop\\news_sohusite_xml.dat', 'rb') as f:
    lines = f.readlines()
    ans = 0
    for line in lines:
        if ans/6<10000+5:
            ans = ans + 1
            continue
        s = line.decode('GB2312', "ignore")
        if ans % 6 == 4 and s!='':
            doc_test.append(s[9:-11])
            print(doc_test[-1])
            if(len(doc_test)>5):
                break;
        ans = ans + 1

texts_test = []
for i in doc_test:
    doc = list(jieba.cut(i))
    texts_test.append([w for w in doc if w not in stopwords])

corpus_test = [dictionary.doc2bow(text) for text in texts_test]
topics_test = ldamodel.get_document_topics(corpus_test)
for top in topics_test:
    print(top)
"""
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值