中文文本分析(matplotlib库的应用)
Request:
1.结合wordcloud将《红楼梦》、《水浒传》、《三国演义》分别绘制人物的词云图(按照人物出现的频率)
2.分别统计《红楼梦》、《水浒传》、《三国演义》前20个主要人物的出场次数,并绘制出场次数的统计图
3.结合networkx绘制《红楼梦》、《水浒传》、《三国演义》主要人物的社交关系网络图
首先介绍一下代码来源:生成词云、绘制关系网络图的部分参考了书本《Python语言程序设计》(上海交通大学出版社);我主要写的是新增的类封装和GUI实现。代码里有许多被注释掉的操作没有删除,但都有一定的意义(本来是想先用plt保存图片,后来懒得搞了,直接用plt自带的界面显示,所以有很多存储操作被我注释掉了)。
上代码:

```python
import re
import jieba
import wordcloud
import matplotlib.pyplot as plt
from imageio import imread
import numpy as np
import networkx as nx
import matplotlib
import tkinter as tk
import os
# Use the SimHei font so that Chinese labels render in matplotlib figures.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']
class TextAnalysisAPP:
    """Analyse how the characters of a novel are mentioned in its text.

    An instance is built from the path of the novel's text file and the path
    of a file listing the character names of interest (one name per line).
    It can rank the characters by number of mentions, show a word cloud and
    a bar chart of that ranking, and draw a co-occurrence network.

    Class attributes (override on the class or on an instance):
        count: maximum number of characters kept in the frequency ranking.
        mask_path: image whose shape is used as the word-cloud mask.
        font_path: font file used to render the word cloud.
    """

    count = 20
    mask_path = '六边形.png'  # resolved relative to the working directory
    font_path = r"C:\Windows\Fonts\simhei.ttf"
    wordFreq_fpath = ''             # path of the last frequency file written
    wordFreq_pc_path = ''           # reserved: path of a saved word-cloud image
    wordFreq_statistics_path = ''   # reserved: path of a saved bar-chart image

    def __init__(self, filepath, needname_path):
        """Remember the input paths and start with an empty ranking.

        Args:
            filepath: path of the UTF-8 encoded novel text.
            needname_path: path of the character-name list, one per line.
        """
        self.filepath = filepath
        self.needname_path = needname_path
        # Per-instance ranking, filled by wordFreq(); kept out of the class
        # body so that instances never share the same list objects.
        self.name = []
        self.time = []

    def needwordslist(self):
        """Return the non-empty, whitespace-stripped lines of the name file.

        The file is decoded as UTF-8 (a leading BOM is tolerated); if that
        fails, it is re-read with the platform's default encoding.
        """
        try:
            with open(self.needname_path, 'r', encoding='utf-8-sig') as f:
                lines = f.readlines()
        except UnicodeDecodeError:
            with open(self.needname_path, 'r') as f:
                lines = f.readlines()
        return [name for name in (line.strip() for line in lines) if name]

    def getText(self):
        """Return the whole novel text, read as UTF-8."""
        with open(self.filepath, 'r', encoding='utf-8') as f:
            return f.read()

    def _book_name(self):
        """Return the text file's name without directories or extension.

        Both ``/`` and ``\\`` are treated as separators so that Windows-style
        paths are handled on every platform.
        """
        base = re.split(r'[/\\]', self.filepath)[-1]
        stem = base.rpartition('.')[0]
        return stem or base

    def wordFreq(self):
        """Rank the listed characters by mentions and save the ranking.

        The text is tokenised with jieba; tokens that appear in the name list
        are counted. At most ``count`` entries are kept, most frequent first.
        The ranking is stored in ``self.name`` / ``self.time`` and written as
        ``name<TAB>count`` lines (UTF-8) to ``<book>_词频.txt`` in the working
        directory.

        Returns:
            The path of the frequency file.
        """
        wanted = set(self.needwordslist())
        counts = {}
        for word in jieba.lcut(self.getText().strip()):
            # Single-character tokens are never treated as names.
            if len(word) > 1 and word in wanted:
                counts[word] = counts.get(word, 0) + 1

        # Slicing (rather than indexing 0..count-1) tolerates texts in which
        # fewer than ``count`` distinct names occur.
        ranking = sorted(counts.items(), key=lambda item: item[1],
                         reverse=True)[:self.count]
        self.name = [word for word, _ in ranking]
        self.time = [times for _, times in ranking]

        freq_path = self._book_name() + '_词频.txt'
        with open(freq_path, 'w', encoding='utf-8') as f:
            f.writelines('{}\t{}\n'.format(word, times)
                         for word, times in ranking)
        self.wordFreq_fpath = freq_path
        return freq_path

    def wordcloud_pc(self):
        """Show a word cloud in which each name is sized by its mention count.

        Raises:
            ValueError: if none of the listed names occurs in the text.
        """
        self.wordFreq()  # refreshes self.name / self.time and the file
        frequencies = dict(zip(self.name, self.time))
        if not frequencies:
            raise ValueError('no listed character name was found in the text')

        # NOTE(review): the author reported that imageio may fail to read
        # this image - verify on the target machine.
        bg_pic = imread(self.mask_path)
        # generate_from_frequencies() takes the counts directly.
        # WordCloud.generate() would re-count words in the given text and
        # would not read the numbers of the frequency file as weights.
        wcloud = wordcloud.WordCloud(
            background_color='white',
            font_path=self.font_path,
            mask=bg_pic,          # shape of the cloud
            width=1000,
            height=860,
            max_words=1000,
            margin=2,             # gap between words
        ).generate_from_frequencies(frequencies)
        plt.imshow(wcloud)
        plt.axis('off')
        plt.show()

    def WordFreqAnalysis(self):
        """Show a horizontal bar chart of the current frequency ranking.

        The ranking is computed first if wordFreq() has not produced one yet.
        """
        if not self.name:
            self.wordFreq()
        times = np.array(self.time)
        names = np.array(self.name)
        # One bar per ranked name; the ranking may be shorter than ``count``.
        positions = np.arange(len(names))

        plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
        plt.figure(figsize=(8, 6))
        plt.barh(positions, times, 0.6, color='SkyBlue', alpha=0.8)
        plt.xlabel('time')
        plt.ylabel('name')
        for value, row in zip(times, positions):
            # ha is the horizontal alignment, va the vertical alignment.
            plt.text(value + 10, row - 0.4, '%d' % int(value),
                     ha='center', va='bottom')
        plt.yticks(positions, names)
        plt.show()

    def CharacterRelation(self):
        """Show a network of characters that appear in the same paragraph.

        Every line of the text is one paragraph. Each unordered pair of
        listed names found in the same paragraph scores one point (names are
        matched as substrings). Scores are divided by the largest score, and
        pairs scoring 0.2 or less are dropped. Edges are drawn in three
        styles: > 0.6, (0.3, 0.6] and <= 0.3.

        Raises:
            ValueError: if no two listed names ever share a paragraph.
        """
        # De-duplicate while keeping file order, so a name listed twice is
        # not counted twice.
        names = list(dict.fromkeys(self.needwordslist()))

        relations = {}
        for paragraph in self.getText().split('\n'):
            # One substring scan per name, then pair up only the names found.
            present = [n for n in names if n in paragraph]
            for i, first in enumerate(present):
                for second in present[i + 1:]:
                    pair = (first, second)
                    relations[pair] = relations.get(pair, 0) + 1
        if not relations:
            raise ValueError(
                'no two listed characters appear in the same paragraph')

        strongest = max(relations.values())
        relations = {pair: n / strongest for pair, n in relations.items()}
        relations = {pair: w for pair, w in relations.items() if w > 0.2}

        plt.figure(figsize=(15, 15))
        G = nx.Graph()
        for (first, second), weight in relations.items():
            G.add_edge(first, second, weight=weight)

        elarge = [(u, v) for u, v, d in G.edges(data=True)
                  if d['weight'] > 0.6]
        emidle = [(u, v) for u, v, d in G.edges(data=True)
                  if 0.3 < d['weight'] <= 0.6]
        esmall = [(u, v) for u, v, d in G.edges(data=True)
                  if d['weight'] <= 0.3]

        pos = nx.spring_layout(G)
        nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=800)
        nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5,
                               alpha=0.9, edge_color='g')
        nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5,
                               alpha=0.6, edge_color='y')
        nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1,
                               alpha=0.4, edge_color='b', style='dashed')
        nx.draw_networkx_labels(G, pos, font_size=12)
        plt.show()
class GUIAPP:
    """Tkinter front end: choose a novel, then choose an analysis to show.

    Both screens are drawn in one Tk window. Switching screens replaces the
    window's widgets, so the event loop is started only once.
    """

    def __init__(self):
        # The Tk window; None until a screen is first shown and again after
        # the event loop has ended.
        self.root = None

    def _ensure_window(self):
        """Create the window on first use, otherwise empty it for reuse.

        Returns:
            True if a new window was created (the caller must then run the
            event loop), False if the existing window was cleared.
        """
        if self.root is not None:
            for widget in self.root.winfo_children():
                widget.destroy()
            return False
        self.root = tk.Tk()
        self.root.title("GUI的测试窗口")
        self.root.geometry("300x400+500+100")
        return True

    def _run_if_new(self, created):
        """Run the event loop for a newly created window, then forget it."""
        if not created:
            return
        try:
            self.root.mainloop()
        finally:
            self.root = None

    @staticmethod
    def _analyzer(name):
        """Build the analyser for the novel called *name*.

        The text is expected at ``./文本汇总/<name>.txt`` and the character
        list at ``./人物汇总/<name>人物.txt``, relative to the working directory.
        """
        filepath = os.path.join('.', '文本汇总', '{}.txt'.format(name))
        needname_path = os.path.join('.', '人物汇总', '{}人物.txt'.format(name))
        return TextAnalysisAPP(filepath, needname_path)

    def operate(self):
        """Show the novel-selection screen (and start the GUI if needed)."""
        created = self._ensure_window()
        for book in ('红楼梦', '水浒传', '三国演义'):
            # Bind the current title as a default so that each button keeps
            # its own novel instead of the loop's last value.
            tk.Button(self.root, text=book,
                      command=lambda book=book: self.first_button(book)).pack()
        self._run_if_new(created)

    def first_button(self, name):
        """Show the analysis menu for the novel called *name*."""
        created = self._ensure_window()
        frame = tk.Frame(self.root)
        frame.pack()
        tk.Button(frame, text=name + '词云',
                  command=lambda: self.wordcloud_button(name)).pack()
        tk.Button(frame, text=name + '人物统计',
                  command=lambda: self.statistics_button(name)).pack()
        tk.Button(frame, text=name + '人物关系',
                  command=lambda: self.relations_button(name)).pack()
        tk.Button(frame, text='返回',
                  command=lambda: self.return_button(name)).pack()
        self._run_if_new(created)

    def wordcloud_button(self, name):
        """Show the character word cloud of the novel called *name*."""
        self._analyzer(name).wordcloud_pc()

    def statistics_button(self, name):
        """Show the character-frequency bar chart of the novel *name*."""
        analyzer = self._analyzer(name)
        analyzer.wordFreq()          # compute the ranking first
        analyzer.WordFreqAnalysis()  # then plot it

    def relations_button(self, name):
        """Show the character relationship network of the novel *name*."""
        self._analyzer(name).CharacterRelation()

    def return_button(self, name):
        """Go back to the novel-selection screen (*name* is unused)."""
        self.operate()
def main():
    """Program entry point: build the GUI and hand control to it."""
    GUIAPP().operate()
# Run the GUI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()