Step:
-
目标文章:鹿鼎记
-
实现功能:
-
人物统计
-
云图
-
程序源码:
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 7 16:57:02 2018
@author: fslq
"""
#初步获取文本
import os
import os.path
import codecs
import jieba
import numpy
import pandas
# Module-level accumulators shared by the functions below:
#   fileContents - raw text of every file read by getContent
#   segments     - every word produced by jieba segmentation
fileContents, segments = [], []
def getContent(filespath, stonampath):  # filespath: path of the source folder
    """Read every file under ``filespath`` and segment its text with jieba.

    The raw text of each file is appended to the module-level ``fileContents``
    list and every word produced by ``jieba.cut`` is appended to the
    module-level ``segments`` list. Nothing is returned.

    Args:
        filespath: Root folder; it is walked recursively with ``os.walk`` and
            every file found is read as UTF-8 text.
        stonampath: Path handed to ``jieba.load_userdict`` before segmenting.
    """
    # Load the user dictionary first so it is in effect for every file.
    jieba.load_userdict(stonampath)
    for root, dirs, files in os.walk(filespath):
        for name in files:
            # ``with`` closes the handle even if reading/decoding fails.
            with codecs.open(os.path.join(root, name), 'r', 'utf-8') as f:
                fileContent = f.read()
            fileContents.append(fileContent)
            # Segment each file as it is read. The previous version called
            # jieba.cut once, on the last file only, and the generator it
            # returned was exhausted after the first row, so all other files
            # were silently ignored (and an empty folder raised NameError).
            segments.extend(jieba.cut(fileContent, cut_all=False))
def inputTxt(stonampath, stoworpath):
    """Write the stop-word table: every segmented word not in the name list.

    Counts the words in the module-level ``segments`` list, removes those that
    appear in the ``stopword`` column of the file at ``stonampath`` and writes
    the remaining words (one column, header ``stopword``) to ``stoworpath``.

    Args:
        stonampath: Delimited text file read with ``pandas.read_csv``; it must
            contain a column named ``stopword``.
        stoworpath: Output path of the tab-separated stop-word table.
    """
    segmentDataFrame = pandas.DataFrame({'stopword': segments})
    # Count occurrences per word, most frequent first. The former
    # ``.agg({'计数': numpy.size})`` dict-renaming form raises
    # SpecificationError on pandas >= 1.0; ``size()`` gives the same counts.
    segStat = (
        segmentDataFrame.groupby(by='stopword')
        .size()
        .reset_index(name='计数')
        .sort_values(by='计数', ascending=False)
    )
    stopwords = pandas.read_csv(stonampath, encoding='utf-8', index_col=False, engine='python')
    # Keep only the words that are NOT listed in the name file.
    fSegStat = segStat[~segStat.stopword.isin(stopwords.stopword)]
    fSegStat.stopword.to_csv(stoworpath, header=True, index=False, sep='\t')
#得到人物出现次数数据
def funTxt(stoworpath, namnumpath):  # namnumpath: name counts; stoworpath: stop-word table (column named "stopword")
    """Write per-word occurrence counts, excluding stop words, to ``namnumpath``.

    Counts the words in the module-level ``segments`` list, drops every word
    found in the ``stopword`` column of ``stoworpath`` and writes the remaining
    ``segment`` / ``计数`` rows as a tab-separated file. The file is then
    rewritten without lines that are empty or contain a double quote, an ASCII
    comma, an ideographic space (U+3000) or an ASCII space.

    Args:
        stoworpath: Stop-word table read with ``pandas.read_csv``; it must
            contain a column named ``stopword``.
        namnumpath: Output path of the tab-separated count table.
    """
    segmentDataFrame = pandas.DataFrame({'segment': segments})
    # Count occurrences per word, most frequent first. The former
    # ``.agg({'计数': numpy.size})`` dict-renaming form raises
    # SpecificationError on pandas >= 1.0; ``size()`` gives the same counts.
    segStat = (
        segmentDataFrame.groupby(by='segment')
        .size()
        .reset_index(name='计数')
        .sort_values(by='计数', ascending=False)
    )
    # Filter out everything listed in the stop-word table.
    stopwords = pandas.read_csv(stoworpath, encoding='utf-8', index_col=False, engine='python')
    fSegStat = segStat[~segStat.segment.isin(stopwords.stopword)]
    fSegStat.to_csv(namnumpath, header=True, index=False, sep='\t')

    # Strip unwanted rows from the file that was just written.
    with open(namnumpath, 'r', encoding='utf8') as r:
        lines = r.readlines()
    unwanted = ('"', ',', '\u3000', ' ')
    with open(namnumpath, 'w', encoding='utf8') as w:
        for l in lines:
            # ``==`` instead of ``is``: identity of equal strings is an
            # implementation detail and must not be relied on.
            if l == '\n' or any(mark in l for mark in unwanted):
                continue
            w.write(l)
#绘图
def draws(namnumpath):
    """Show a word cloud of all counts and a bar chart of the 15 first rows.

    Reads the tab-separated table at ``namnumpath`` (columns ``segment`` and
    ``计数``), draws a word cloud shaped by a mask image, then plots the first
    15 rows as a horizontal bar chart and displays both with ``plt.show()``.

    The font file and the mask image are hard-coded Windows paths.

    Args:
        namnumpath: Path of the count table; its file name (without extension)
            is used as the title of the word-cloud figure.
    """
    from matplotlib.font_manager import FontProperties
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    font = FontProperties(fname=r'C:\Windows\Fonts\STXINGKA.TTF', size=12)

    # scipy.misc.imread was removed in SciPy 1.2, so load the mask through
    # matplotlib instead. plt.imread returns PNG data as floats in [0, 1];
    # rescale to 0-255 bytes, the range the word cloud mask is documented
    # against (white = 255 is masked out).
    self_img = plt.imread(r'C:\Users\fslq\Desktop\PythonFile\timg.png')
    if self_img.dtype.kind == 'f':
        self_img = (self_img * 255).round().astype(numpy.uint8)

    fSegStat = pandas.read_csv(namnumpath, encoding='utf-8', sep='\t', engine='python')
    wordcloud = WordCloud(
        font_path=r'C:\Windows\Fonts\STXINGKA.TTF',
        background_color='white',
        mask=self_img,
        width=1500,
        height=1500
    )
    # Map each word to its count and feed the frequencies to the cloud.
    words = fSegStat.set_index('segment').to_dict()
    wordcloud.fit_words(words['计数'])

    # Title = file name without extension. This replaces the fixed slice
    # namnumpath[44:52], which only matched one specific absolute path.
    title = os.path.splitext(os.path.basename(namnumpath))[0]
    plt.title(title, fontproperties=font)
    plt.axis("off")
    plt.imshow(wordcloud)

    # Bar chart of the first 15 rows, reversed so the first row is on top.
    loandata = fSegStat.head(15)
    fd = loandata.set_index('segment')
    fd[::-1].plot(kind='barh', rot=0).set_yticklabels(loandata.segment[::-1], fontproperties=font)
    plt.legend(prop=font)
    plt.show()
- 主程序:
# 1. The Deer and the Cauldron (鹿鼎记)
if __name__ == '__main__':
    # Input: folder holding the text files to analyse (change for another work).
    filespath = r'C:\Users\fslq\Desktop\PythonFile\filespath'
    # Input: name list, used both as the jieba user dictionary and as the
    # table of words to keep out of the stop-word table.
    stonampath = r'C:\Users\fslq\Desktop\PythonFile\stonampath\人物名称-停用词.txt'
    # Intermediate output: generated stop-word table.
    stoworpath = r'C:\Users\fslq\Desktop\PythonFile\stoworpath\停用表.txt'
    # Final output: per-name occurrence counts.
    namnumpath = r'C:\Users\fslq\Desktop\PythonFile\namnumpath\分析金庸人物计数.txt'

    # Pipeline: read + segment -> build stop words -> count names -> plot.
    getContent(filespath, stonampath)
    inputTxt(stonampath, stoworpath)
    funTxt(stoworpath, namnumpath)
    draws(namnumpath)
- 执行结果: