LDA Python 代码

最新推荐文章于 2024-05-17 08:57:45 发布

Forever_Young_L

最新推荐文章于 2024-05-17 08:57:45 发布

阅读量555

点赞数 1

分类专栏： nlp

本文链接：https://blog.csdn.net/qq_41648012/article/details/104065787

版权

nlp 专栏收录该内容

6 篇文章 1 订阅

订阅专栏

import chardet
import jieba
import gensim
import codecs
import time

# Load the first MAX_RECORDS records of the Sohu news dump.
#
# The file is treated as a sequence of fixed-size records of RECORD_LINES
# lines each.  Within a record, the line at offset 3 goes to S1 and the line
# at offset 4 goes to S2, after slicing off a fixed-width prefix and suffix.
# NOTE(review): the slice widths match "<contenttitle>"/"</contenttitle>\n"
# (14/16 chars) and "<content>"/"</content>\n" (9/11 chars) -- presumably
# S1 holds titles and S2 holds article bodies; confirm against the data file.
RECORD_LINES = 6
MAX_RECORDS = 10000

S1 = []
S2 = []
with open('C:\\Users\\28612\\Desktop\\news_sohusite_xml.dat', 'rb') as f:
    # Iterate the file object directly instead of calling f.readlines():
    # only the first RECORD_LINES * MAX_RECORDS lines are needed, so there is
    # no reason to pull the whole dump into memory first.
    for ans, line in enumerate(f):
        # Integer comparison replaces the float test `ans / 6 == 10000`.
        if ans >= RECORD_LINES * MAX_RECORDS:
            break
        offset = ans % RECORD_LINES
        # Decode only the two lines that are kept.  Undecodable bytes are
        # dropped ("ignore"), so decoding itself can never raise.
        if offset == 3:
            S1.append(line.decode('GB2312', "ignore")[14:-16])
        elif offset == 4:
            S2.append(line.decode('GB2312', "ignore")[9:-11])

# Records whose S2 entry is empty after slicing carry no text to model.
doc_set = [s for s in S2 if s != '']
print(len(doc_set))

# Load the stop-word list: one entry per line, trailing newline removed.
# The words are stored as dict keys (the value 1 is only a placeholder) so
# that `word in stopwords` is a hash lookup rather than a list scan.
with open('C:\\Users\\28612\\Desktop\\stopwords.txt', 'r') as f:
    stopwords = {entry.strip('\n'): 1 for entry in f.readlines()}

# Segment every document with jieba and discard tokens found in `stopwords`.
# The elapsed time printed below covers segmentation and filtering together.
time_start = time.time()
texts = [
    [token for token in jieba.cut(document) if token not in stopwords]
    for document in doc_set
]
time_end = time.time()
print('去停用词：', time_end - time_start)

# Disabled debugging aid, kept as a bare string literal so it never runs.
# If re-enabled it would write each tokenized document to a GB2312-encoded
# file (token count on one line, then the space-separated tokens) while
# echoing the same output to stdout.
"""
with codecs.open('C:\\Users\\28612\\Desktop\\data', 'w', encoding='GB2312') as f:
    for text in texts:
        f.write(str(len(text)) + '\n')
        print(str(len(text)))
        for w in text:
            f.write(w + ' ')
            print(w,end=' ')
        f.write('\n')
        print()
"""

# Build the token <-> integer-id mapping from the tokenized documents.
dictionary = gensim.corpora.Dictionary(texts)

# Convert each tokenized document to its bag-of-words representation.
corpus = list(map(dictionary.doc2bow, texts))

# Train an 18-topic LDA model, making 10 passes over the corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=18,
    passes=10,
)

# Print the learned topics as returned by print_topics().
for topic in ldamodel.print_topics():
    print(topic)


# Disabled inference example, kept as a bare string literal so it never runs.
# If re-enabled it would skip the first 10005 six-line records of the same
# dump, collect six further documents from the `ans % 6 == 4` lines, tokenize
# them the same way as the training data, and print the topic distribution
# that `ldamodel.get_document_topics` assigns to each.
"""
doc_test = []
with open('C:\\Users\\28612\\Desktop\\news_sohusite_xml.dat', 'rb') as f:
    lines = f.readlines()
    ans = 0
    for line in lines:
        if ans/6<10000+5:
            ans = ans + 1
            continue
        s = line.decode('GB2312', "ignore")
        if ans % 6 == 4 and s!='':
            doc_test.append(s[9:-11])
            print(doc_test[-1])
            if(len(doc_test)>5):
                break;
        ans = ans + 1

texts_test = []
for i in doc_test:
    doc = list(jieba.cut(i))
    texts_test.append([w for w in doc if w not in stopwords])

corpus_test = [dictionary.doc2bow(text) for text in texts_test]
topics_test = ldamodel.get_document_topics(corpus_test)
for top in topics_test:
    print(top)
"""

Forever_Young_L

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
2
评论
LDA Python 代码

import chardet; import jieba; import gensim; S1 = []; S2 = []; with open('C:\\Users\\28612\\Desktop\\news_sohusite_xml.dat', 'rb') as f: lines = f.readlines(); ans = 0; for line in lines: ...
复制链接

扫一扫