Scraping Jin Yong Characters with Python

Steps:

  • Target novel: 鹿鼎记

  • Features:

  • Character frequency statistics

  • Word cloud

  • Source code:

# -*- coding: utf-8 -*-

"""
Created on Sat Jul  7 16:57:02 2018

@author: fslq
"""
# Step 1: load the raw text
import os
import os.path
import codecs
import jieba
import numpy
import pandas
# Read the text contents
fileContents=[]
segments=[]
def getContent(filespath,stonampath):  # filespath: source folder; stonampath: character-name user dictionary for jieba
    for root,dirs,files in os.walk(filespath):
        for name in files:
            f=codecs.open(os.path.join(root,name),'r','utf-8')
            fileContent=f.read()
            f.close()
            fileContents.append(fileContent)
    corpos=pandas.DataFrame({'fileContent':fileContents})

    # Segment the articles; the user dictionary keeps character names as single tokens
    jieba.load_userdict(stonampath)
    for index,row in corpos.iterrows():
        fileContent=row['fileContent']
        seg_list=jieba.cut(fileContent,cut_all=False)  # cut each article, not just the last one read
        for w in seg_list:
            segments.append(w)

# Build the stopword table: every token that is not a character name
def inputTxt(stonampath,stoworpath):
    segmentDataFrame=pandas.DataFrame({'stopword':segments})
    # Count each token
    segStat=segmentDataFrame.groupby(by='stopword').size().reset_index(name='计数').sort_values(by='计数',ascending=False)
    # stonampath holds the character names (column name: stopword)
    stopwords=pandas.read_csv(stonampath,encoding='utf-8',index_col=False,engine='python')
    # Drop the character names; everything left over is written out as the stopword table
    fSegStat=segStat[~segStat.stopword.isin(stopwords.stopword)]
    fSegStat.stopword.to_csv(stoworpath,header=True,index=False,sep='\t')
# Produce the character-occurrence counts
def funTxt(stoworpath,namnumpath):  # stoworpath: stopword table (column name: stopword); namnumpath: output file of character-name counts
    # Keep only character names by filtering against the stopword table
    segmentDataFrame=pandas.DataFrame({'segment':segments})
    # Count each token
    segStat=segmentDataFrame.groupby(by='segment').size().reset_index(name='计数').sort_values(by='计数',ascending=False)
    # Remove everything listed in the stopword table
    stopwords=pandas.read_csv(stoworpath,encoding='utf-8',index_col=False,engine='python')
    fSegStat=segStat[~segStat.segment.isin(stopwords.stopword)]
    fSegStat.to_csv(namnumpath,header=True,index=False,sep='\t')
    # Strip unwanted lines: quotes, commas, full-width or ordinary spaces, and blank lines
    with open(namnumpath,'r',encoding='utf8') as r:
        lines=r.readlines()
    with open(namnumpath,'w',encoding='utf8') as w:
        for l in lines:
            if any(ch in l for ch in ('"',',','\u3000',' ')) or l=='\n':
                continue
            w.write(l)
# Plot the word cloud and a bar chart of the top characters
def draws(namnumpath):
    from matplotlib.font_manager import FontProperties
    from wordcloud import WordCloud
    import matplotlib.pyplot as plt
    from PIL import Image  # used to load the mask image as a numpy array
    font = FontProperties(fname=r'C:\Windows\Fonts\STXINGKA.TTF', size=12)
    self_img = numpy.array(Image.open(r'C:\Users\fslq\Desktop\PythonFile\timg.png'))
    fSegStat=pandas.read_csv(namnumpath,encoding='utf-8',sep='\t',engine='python')
    wordcloud=WordCloud(
        font_path=r'C:\Windows\Fonts\STXINGKA.TTF',
        background_color='white',
        mask=self_img,
        width=1500,
        height=1500
    )
    # Feed the name -> count mapping straight into the word cloud
    words=fSegStat.set_index('segment').to_dict()
    wordcloud.fit_words(words['计数'])
    plt.title(os.path.splitext(os.path.basename(namnumpath))[0], fontproperties=font)
    plt.axis("off")
    plt.imshow(wordcloud)

    # Horizontal bar chart of the 15 most frequent characters
    loandata=fSegStat.head(15)
    fd=loandata.set_index('segment')
    fd[::-1].plot(kind='barh',rot=0).set_yticklabels(loandata.segment[::-1], fontproperties=font)
    plt.legend(prop=font)
    plt.show()
     
  • Main program:
# 1. 鹿鼎记
if __name__=='__main__':
    stoworpath=r'C:\Users\fslq\Desktop\PythonFile\stoworpath\停用表.txt'
    stonampath=r'C:\Users\fslq\Desktop\PythonFile\stonampath\人物名称-停用词.txt'
    namnumpath=r'C:\Users\fslq\Desktop\PythonFile\namnumpath\分析金庸人物计数.txt'
    
    filespath=r'C:\Users\fslq\Desktop\PythonFile\filespath'  # change this folder to point at the novel's text files
    getContent(filespath,stonampath)
    inputTxt(stonampath,stoworpath)
    funTxt(stoworpath,namnumpath)
    draws(namnumpath)
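
For reference, the counting step can also be expressed without pandas. The sketch below is a minimal, self-contained variant rather than the program above: it assumes a single plain-text chapter file and a one-name-per-line character dictionary, and the file names ludingji.txt, names.txt and cloud.png are placeholders.

# Minimal counting sketch with collections.Counter (paths are placeholders)
from collections import Counter
import jieba
from wordcloud import WordCloud

text = open('ludingji.txt', encoding='utf-8').read()              # hypothetical source text
names = set(open('names.txt', encoding='utf-8').read().split())   # hypothetical name list, one name per line

jieba.load_userdict('names.txt')   # keep character names as single tokens
counts = Counter(w for w in jieba.cut(text) if w in names)

wc = WordCloud(font_path=r'C:\Windows\Fonts\STXINGKA.TTF', background_color='white')
wc.fit_words(counts)               # Counter is a dict subclass, so fit_words accepts it
wc.to_file('cloud.png')
print(counts.most_common(10))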
  • Execution results:
    (Result images: word cloud and bar chart of the character counts.)