```python
import os

import chardet
import jieba
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from wordcloud import WordCloud

num_topics = 5
newsTextdir = r'D:\doc'

def getnewstext(newsdir):
    """Walk newsdir and return the contents of every .txt file as a list of documents."""
    docs = []
    for d, _, fns in os.walk(newsdir):
        for fn in fns:
            if fn.endswith('.txt'):
                file = d + os.sep + fn
                print(file)
                try:
                    with open(file) as f:
                        text = f.read()
                except UnicodeDecodeError:
                    # The platform default encoding failed; let chardet guess.
                    with open(file, 'rb') as ft:
                        cs = chardet.detect(ft.read())
                    with open(file, encoding=cs['encoding']) as f:
                        text = f.read()
                docs.append(text)
    return docs

alllines = getnewstext(newsTextdir)

# Load the stopword list (one word per line).
with open('stopword.txt', 'r', encoding='utf-8') as f:
    stoplist = set(w.strip() for w in f)

# Segment each document with jieba in full mode, dropping stopwords
# and single-character tokens.
segtexts = []
for line in alllines:
    doc = [w for w in jieba.cut(line, cut_all=True)
           if len(w) > 1 and w not in stoplist]
    segtexts.append(doc)

# Build the vocabulary: keep words appearing in at least 2 documents,
# capped at the 1000 most frequent entries.
dictionary = Dictionary(segtexts)
dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=1000)
corpus = [dictionary.doc2bow(text) for text in segtexts]

# Train the LDA model and print the top 10 words of each topic.
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
topics = lda.print_topics(num_topics=num_topics, num_words=10)
print(topics)

# Draw a word cloud per topic; simfang.ttf supplies a Chinese font.
font = r'C:\Windows\Fonts\simfang.ttf'
wc = WordCloud(collocations=False, font_path=font,
               width=2800, height=2800, max_words=20, margin=2)
for topicid in range(num_topics):
    tlist = lda.get_topic_terms(topicid, topn=1000)
    wdict = {dictionary[wid]: prob for wid, prob in tlist}
    print(wdict)
    wordcloud = wc.generate_from_frequencies(wdict)
    wordcloud.to_file('topic_' + str(topicid) + '.png')
```
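Once the model is trained, the same `lda`, `dictionary`, and `stoplist` objects can be reused to score unseen text. Below is a minimal sketch of topic inference for a new document; the `new_text` placeholder is illustrative, not part of the original script:

```python
# Hypothetical usage: infer the topic mixture of a new document with
# the `lda`, `dictionary`, and `stoplist` built by the script above.
new_text = '……'  # placeholder: any raw Chinese text
tokens = [w for w in jieba.cut(new_text, cut_all=True)
          if len(w) > 1 and w not in stoplist]
bow = dictionary.doc2bow(tokens)
# get_document_topics yields (topic_id, probability) pairs.
for topic_id, prob in lda.get_document_topics(bow):
    print(topic_id, round(prob, 3))
```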
This code demonstrates how to use jieba for Chinese word segmentation, gensim to build an LDA topic model, and word clouds to visualize the main topics in a collection of texts. It first reads the TXT files, handling encoding detection along the way, then segments the text and filters stopwords, and finally trains the LDA model, prints the topics, and renders a word-cloud image for each topic.
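The script fixes `num_topics = 5` up front. A common way to sanity-check that choice is gensim's `CoherenceModel`; the sketch below assumes the `corpus`, `segtexts`, and `dictionary` objects built by the script above, and the candidate topic counts are arbitrary examples:

```python
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Compare c_v coherence across a few candidate topic counts;
# higher coherence loosely indicates more interpretable topics.
for k in (3, 5, 8, 10):
    model = LdaModel(corpus, id2word=dictionary, num_topics=k)
    cm = CoherenceModel(model=model, texts=segtexts,
                        dictionary=dictionary, coherence='c_v')
    print(k, cm.get_coherence())
```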