```python
import os

import chardet
import jieba
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from wordcloud import WordCloud

num_topics = 5
newsTextdir = r'D:\doc'

def getnewstext(newsdir):
    """Walk newsdir and return the contents of every .txt file as a list of documents."""
    docs = []
    for d, _, fns in os.walk(newsdir):
        for fn in fns:
            if fn.endswith('.txt'):
                file = d + os.sep + fn
                print(file)
                try:
                    with open(file) as f:
                        text = f.read()
                except UnicodeDecodeError:
                    # The platform default encoding failed; let chardet guess.
                    with open(file, 'rb') as ft:
                        cs = chardet.detect(ft.read())
                    with open(file, encoding=cs['encoding']) as f:
                        text = f.read()
                docs.append(text)
    return docs

alllines = getnewstext(newsTextdir)

# Load the stopword list (one word per line).
with open('stopword.txt', 'r', encoding='utf-8') as f:
    stoplist = set(w.strip() for w in f)

# Segment each document with jieba in full mode, dropping stopwords
# and single-character tokens.
segtexts = []
for line in alllines:
    doc = [w for w in jieba.cut(line, cut_all=True)
           if len(w) > 1 and w not in stoplist]
    segtexts.append(doc)

# Build the vocabulary: keep words appearing in at least 2 documents,
# capped at the 1000 most frequent entries.
dictionary = Dictionary(segtexts)
dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=1000)
corpus = [dictionary.doc2bow(text) for text in segtexts]

# Train the LDA model and print the top 10 words of each topic.
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
topics = lda.print_topics(num_topics=num_topics, num_words=10)
print(topics)

# Draw a word cloud per topic; simfang.ttf supplies a Chinese font.
font = r'C:\Windows\Fonts\simfang.ttf'
wc = WordCloud(collocations=False, font_path=font,
               width=2800, height=2800, max_words=20, margin=2)
for topicid in range(num_topics):
    tlist = lda.get_topic_terms(topicid, topn=1000)
    wdict = {dictionary[wid]: prob for wid, prob in tlist}
    print(wdict)
    wordcloud = wc.generate_from_frequencies(wdict)
    wordcloud.to_file('topic_' + str(topicid) + '.png')
```
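Once the model is trained, the same `lda`, `dictionary`, and `stoplist` objects can be reused to score unseen text. Below is a minimal sketch of topic inference for a new document; the `new_text` placeholder is illustrative, not part of the original script:

```python
# Hypothetical usage: infer the topic mixture of a new document with
# the `lda`, `dictionary`, and `stoplist` built by the script above.
new_text = '……'  # placeholder: any raw Chinese text
tokens = [w for w in jieba.cut(new_text, cut_all=True)
          if len(w) > 1 and w not in stoplist]
bow = dictionary.doc2bow(tokens)
# get_document_topics yields (topic_id, probability) pairs.
for topic_id, prob in lda.get_document_topics(bow):
    print(topic_id, round(prob, 3))
```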
This code demonstrates how to use jieba for Chinese word segmentation, gensim to build an LDA topic model, and word clouds to visualize the main topics in a collection of texts. It first reads the TXT files, handling encoding detection along the way, then segments the text and filters stopwords, and finally trains the LDA model, prints the topics, and renders a word-cloud image for each topic.
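The script fixes `num_topics = 5` up front. A common way to sanity-check that choice is gensim's `CoherenceModel`; the sketch below assumes the `corpus`, `segtexts`, and `dictionary` objects built by the script above, and the candidate topic counts are arbitrary examples:

```python
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Compare c_v coherence across a few candidate topic counts;
# higher coherence loosely indicates more interpretable topics.
for k in (3, 5, 8, 10):
    model = LdaModel(corpus, id2word=dictionary, num_topics=k)
    cm = CoherenceModel(model=model, texts=segtexts,
                        dictionary=dictionary, coherence='c_v')
    print(k, cm.get_coherence())
```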