# Assignment 1.2.0 (作业1.2.0)

import jieba
import os
import chardet
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Number of LDA topics to train; the script also prints this many topics and
# writes this many word-cloud images.
num_topics = 5
# Root directory that is walked recursively for the news text files.
newsTextdir = r'D:\doc'

def getnewstext(newsdir):
    """Read every file whose name ends in ``txt`` below *newsdir*.

    The directory tree is walked recursively with ``os.walk``.  Each matching
    file is first read with the platform's default text encoding; if that
    fails to decode, the encoding is guessed with ``chardet`` and the file is
    read again with the guessed encoding.

    Args:
        newsdir: Root directory to search.

    Returns:
        A list with one string per file, in ``os.walk`` order.  The lines of
        a file are joined with ``'\\n'``; because each line still carries its
        own line terminator, consecutive lines end up separated by an empty
        line.

    Raises:
        OSError: If a matching file cannot be opened.
    """
    docs = []
    for dirpath, _subdirs, filenames in os.walk(newsdir):
        for fn in filenames:
            if not fn.endswith('txt'):
                continue
            file = dirpath + os.sep + fn
            print(file)
            try:
                # `with` guarantees the handle is closed even when decoding
                # fails half-way through the file.
                with open(file) as f:
                    lines = f.readlines()
            except UnicodeDecodeError:
                # Default encoding did not fit: sniff the raw bytes instead.
                with open(file, "rb") as ft:
                    cs = chardet.detect(ft.read())
                # chardet may report no encoding at all; fall back to UTF-8
                # and replace undecodable bytes so that a single bad file
                # does not abort loading the whole corpus.
                encoding = cs['encoding'] or 'utf-8'
                with open(file, encoding=encoding, errors='replace') as f:
                    lines = f.readlines()
            docs.append('\n'.join(lines))
    return docs

# Load the raw documents: one string per text file found under newsTextdir.
alllines = getnewstext(newsTextdir)

# Load the stop words, one per line with surrounding whitespace stripped.
# The `with` block closes the file once the set has been built.
with open('stopword.txt', 'r', encoding="utf-8") as stopword_file:
    stoplist = {w.strip() for w in stopword_file}

# Segment every document with jieba (cut_all=True) and keep only the tokens
# that are longer than one character and are not stop words.
segtexts = []
for text in alllines:
    tokens = jieba.cut(text, cut_all=True)
    segtexts.append(
        [tok for tok in tokens if len(tok) > 1 and tok not in stoplist]
    )

# Build the vocabulary from the segmented documents, then prune it:
# no_below=2, no_above=1.0, and at most keep_n=1000 tokens are retained.
dictionary = Dictionary(segtexts)
dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=1000)

# Encode each document as a bag of words and fit the LDA model on the result.
corpus = list(map(dictionary.doc2bow, segtexts))
lda = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)

# Print every topic together with its 10 highest-weighted words.
topics = lda.print_topics(num_topics=num_topics, num_words=10)
print(topics)

# Render one word-cloud image per topic, using the font file at `font`.
font = r'C:\Windows\Fonts\simfang.ttf'
wc = WordCloud(
    collocations=False,
    font_path=font,
    width=2800,
    height=2800,
    max_words=20,
    margin=2,
)
for topicid in range(num_topics):
    # (term id, weight) pairs for this topic, mapped to {word: weight}.
    term_weights = lda.get_topic_terms(topicid, topn=1000)
    wdict = {dictionary[term_id]: weight for term_id, weight in term_weights}
    print(wdict)
    # Written to topic_<id>.png in the current working directory.
    wc.generate_from_frequencies(wdict).to_file(f'topic_{topicid}.png')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值