中文文本分析(matplotlib的库的应用)

中文文本分析(matplotlib的库的应用)

Request:
1.结合wordcloud将《红楼梦》、《水浒传》、《三国演义》分别绘制人物的词云图(按照人物出现的频率)
2.分别统计《红楼梦》、《水浒传》、《三国演义》前20个主要人物的出场次数,并绘制出场次数的统计图
3.结合networkx绘制《红楼梦》、《水浒传》、《三国演义》主要人物的社交关系网络图

首先介绍一下代码来源:生成词云和绘制关系网络图的部分参考了《Python语言程序设计》(上海交通大学出版社)一书,我自己主要做的是把功能封装成一个类,并加上了GUI实现。代码里有许多被注释掉的操作没有删除,但它们都有一定的意义(本来是想先用plt保存图片,后来嫌麻烦,改为直接用plt自带的窗口显示,所以很多保存图片的操作被我注释掉了)。
上代码:
import re
import jieba
import wordcloud
import matplotlib.pyplot as plt
from imageio import imread
import numpy as np
import networkx as nx
import matplotlib
import tkinter as tk

import os

# Select the SimHei font so that Chinese labels render in matplotlib figures.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
class TextAnalysisAPP:
    """Open a novel's text and analyse the characters that appear in it.

    The instance is configured with two paths: the text to analyse and a
    file listing the character names of interest (one name per line).
    """

    count = 20  # number of top characters kept by wordFreq()
    wordFreq_fpath = ''  # path of the frequency file written by wordFreq()
    wordFreq_pc_path = ''  # word-cloud image path (never assigned by this class)
    wordFreq_statistics_path = ''  # statistics image path (never assigned by this class)

    def __init__(self, filepath, needname_path):
        """Remember the text path and the character-name list path."""
        self.filepath = filepath
        self.needname_path = needname_path
        # Per-instance results of the last wordFreq() run, in rank order.
        self.time = []  # occurrence counts
        self.name = []  # character names

    def needwordslist(self):
        """Return the stripped, non-empty lines of the character-name file."""
        # NOTE(review): no encoding is given, so the platform default is used,
        # whereas getText() reads UTF-8 -- confirm the name file's encoding.
        with open(self.needname_path, 'r') as f:
            stripped = (line.strip() for line in f)
            return [line for line in stripped if line]

    def getText(self):
        """Return the whole text file (read as UTF-8) as one string."""
        with open(self.filepath, 'r', encoding='utf-8') as f:
            return f.read()

    def _book_name(self):
        """Derive the output-file prefix from the text path.

        The path is split on '/', '\\' and '.', and the second-to-last piece
        (the file stem for a path such as ``dir/name.txt``) is used.
        """
        parts = re.split(r'[/\\.]', self.filepath)
        return parts[-2] if len(parts) >= 2 else parts[-1]

    def wordFreq(self):
        """Count character names in the text and write the top ones to a file.

        The text is segmented with jieba; only tokens longer than one
        character that appear in the name list are counted.  At most
        ``self.count`` entries, ordered by descending count, are written as
        ``name<TAB>count`` lines to ``<book>_词频.txt`` and stored in
        ``self.name`` / ``self.time``.

        Returns:
            The path of the frequency file that was written.
        """
        words = jieba.lcut(self.getText().strip())
        wanted = set(self.needwordslist())  # set: O(1) membership per token
        counts = {}
        for word in words:
            if len(word) == 1:
                continue
            if word in wanted:
                counts[word] = counts.get(word, 0) + 1

        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        # Slicing tolerates texts in which fewer than `count` names occur.
        top = ranked[:self.count]

        self.name = [word for word, _ in top]
        self.time = [freq for _, freq in top]

        freq_path = self._book_name() + '_词频.txt'
        # Explicit UTF-8 so the Chinese names can be written on any platform.
        with open(freq_path, 'w', encoding='utf-8') as f:
            for word, freq in top:
                f.write('{}\t{}\n'.format(word, freq))

        self.wordFreq_fpath = freq_path
        return freq_path

    def wordcloud_pc(self, mask_path=None, font_path=None):
        """Display a word cloud sized by how often each character appears.

        Args:
            mask_path: image whose shape the cloud follows; defaults to
                ``六边形.png`` in the current directory.
            font_path: font used to render the names; defaults to the
                Windows SimHei font file.
        """
        if mask_path is None:
            mask_path = os.path.join('.', '六边形.png')
        if font_path is None:
            font_path = r"C:\Windows\Fonts\simhei.ttf"

        # Original author's note: this library seemed unable to read the image.
        bg_pic = imread(mask_path)
        self.wordFreq()  # refreshes self.name / self.time and the frequency file

        # Feed the counts directly: generate() would re-count the tokens of the
        # "name<TAB>count" text itself instead of using the numbers in it.
        frequencies = dict(zip(self.name, self.time))
        wcloud = wordcloud.WordCloud(
            background_color='white',
            font_path=font_path,
            mask=bg_pic,  # shape of the cloud
            width=1000,
            height=860,
            max_words=1000,
            margin=2,  # spacing between words
        ).generate_from_frequencies(frequencies)

        plt.imshow(wcloud)
        plt.axis('off')
        plt.show()

    def WordFreqAnalysis(self):
        """Show a horizontal bar chart of the top characters' counts."""
        if not self.name:
            # Nothing counted yet on this instance: compute the data first.
            self.wordFreq()

        Xi = np.array(self.time)
        Yi = np.array(self.name)
        # One bar per name actually found, which may be fewer than `count`.
        x = np.arange(len(Yi))
        width = 0.6

        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
        plt.figure(figsize=(8, 6))
        plt.barh(x, Xi, width, color='SkyBlue', alpha=0.8)  # alpha: bar opacity
        plt.xlabel('time')
        plt.ylabel('name')
        for value, row in zip(Xi, x):
            # ha is the horizontal alignment, va the vertical alignment.
            plt.text(value + 10, row - 0.4, '%d' % int(value),
                     ha='center', va='bottom')
        plt.yticks(x, Yi)  # label each bar with the character name
        plt.show()

    @staticmethod
    def _count_cooccurrences(names, paragraphs):
        """Count, per unordered name pair, the paragraphs containing both.

        Keys are ``(earlier, later)`` tuples following the order of *names*.
        """
        relations = {}
        for paragraph in paragraphs:
            # Scan each name once per paragraph instead of once per pair.
            present = [name for name in names if name in paragraph]
            for index, first in enumerate(present):
                for second in present[index + 1:]:
                    key = (first, second)
                    relations[key] = relations.get(key, 0) + 1
        return relations

    def CharacterRelation(self):
        """Draw a network of characters that share paragraphs in the text.

        Two characters are related each time both names occur in the same
        line of the text.  Counts are normalised by the largest count; pairs
        at or below 0.2 are dropped and the rest are drawn in three strength
        bands.

        Raises:
            ValueError: if no two listed names ever share a paragraph.
        """
        # Drop duplicate names (keeping order) so a pair is not counted twice.
        names = list(dict.fromkeys(self.needwordslist()))
        paragraphs = self.getText().split('\n')
        relations = self._count_cooccurrences(names, paragraphs)
        if not relations:
            raise ValueError('no co-occurring character names found in the text')

        strongest = max(relations.values())
        relations = {pair: n / strongest for pair, n in relations.items()}
        relations = {pair: w for pair, w in relations.items() if w > 0.2}

        plt.figure(figsize=(15, 15))
        G = nx.Graph()
        for (first, second), weight in relations.items():
            G.add_edge(first, second, weight=weight)

        elarge = [(u, v) for u, v, d in G.edges(data=True)
                  if d['weight'] > 0.6]
        emidle = [(u, v) for u, v, d in G.edges(data=True)
                  if 0.3 < d['weight'] <= 0.6]
        esmall = [(u, v) for u, v, d in G.edges(data=True)
                  if d['weight'] <= 0.3]

        pos = nx.spring_layout(G)
        nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=800)
        nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9,
                               edge_color='g')
        nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6,
                               edge_color='y')
        nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4,
                               edge_color='b', style='dashed')
        nx.draw_networkx_labels(G, pos, font_size=12)
        plt.show()

class GUIAPP:
    """Tkinter menu for choosing a novel and the analysis to display.

    A single Tk window is kept for the whole session; moving between the
    book menu and a book's analysis menu replaces the window's widgets
    instead of destroying the window and starting another event loop.
    """

    def __init__(self):
        """Start without a window; one is created on first display."""
        self.root = None

    def _show_screen(self, build):
        """Display a screen by calling ``build(root)`` on a cleared window.

        If no window exists yet, it is created and this call runs the Tk
        event loop until the window is closed.
        """
        owns_loop = self.root is None
        if owns_loop:
            self.root = tk.Tk()
            self.root.title("GUI的测试窗口")
            self.root.geometry("300x400+500+100")
        else:
            for widget in self.root.winfo_children():
                widget.destroy()

        build(self.root)

        if owns_loop:
            try:
                self.root.mainloop()
            finally:
                # The window is gone once the loop ends; forget the handle.
                self.root = None

    @staticmethod
    def _make_analyzer(name):
        """Build the TextAnalysisAPP for the book called *name*."""
        filepath = os.path.join('.', '文本汇总', '{}.txt'.format(name))
        needname_path = os.path.join('.', '人物汇总', '{}人物.txt'.format(name))
        return TextAnalysisAPP(filepath, needname_path)

    def operate(self):
        """Show the top-level menu with one button per novel."""
        def build(root):
            for title in ('红楼梦', '水浒传', '三国演义'):
                # Bind `title` as a default so each button keeps its own book.
                tk.Button(root, text=title,
                          command=lambda title=title: self.first_button(title)
                          ).pack()

        self._show_screen(build)

    def first_button(self, name):
        """Show the analysis menu for the book called *name*."""
        def build(root):
            frame = tk.Frame(root)
            frame.pack()
            tk.Button(frame, text=name + '词云',
                      command=lambda: self.wordcloud_button(name)).pack()
            tk.Button(frame, text=name + '人物统计',
                      command=lambda: self.statistics_button(name)).pack()
            tk.Button(frame, text=name + '人物关系',
                      command=lambda: self.relations_button(name)).pack()
            tk.Button(frame, text='返回',
                      command=lambda: self.return_button(name)).pack()

        self._show_screen(build)

    def wordcloud_button(self, name):
        """Display the character word cloud for the book."""
        self._make_analyzer(name).wordcloud_pc()

    def statistics_button(self, name):
        """Display the bar chart of character appearance counts."""
        analyzer = self._make_analyzer(name)
        analyzer.wordFreq()  # compute the counts the chart is drawn from
        analyzer.WordFreqAnalysis()

    def relations_button(self, name):
        """Display the character relationship network for the book."""
        self._make_analyzer(name).CharacterRelation()

    def return_button(self, name):
        """Go back to the top-level book menu (*name* is unused)."""
        self.operate()

def main():
    """Create the GUI and run it until the window is closed."""
    gui = GUIAPP()
    gui.operate()


if __name__ == '__main__':
    main()

最后附上运行视频

  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值