Text analysis of a novel with Python (character relationship graph, character word cloud, etc.)

Text analysis

0. Sample output

[Figure: word cloud output. Words such as "二人" and "不会" were not filtered out here; they can be removed manually.]

[Figure: character relationship graph]

1. Overall approach:

  1. Libraries used: jieba, matplotlib, networkx, wordcloud (a setup sketch follows this list)
  2. Text analyzed: the Sanlian (三联) edition of 连城诀 (A Deadly Secret)
  3. Tools needed: Python, the novel's text file, and a Chinese stop-word file.
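All four libraries are available from PyPI and can be installed up front, e.g. `pip install jieba matplotlib networkx wordcloud`. A minimal setup sketch of the imports and the matplotlib font setting used throughout (it mirrors the complete code at the end and assumes the SimHei font is installed):

import jieba
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import wordcloud

matplotlib.rcParams['font.sans-serif'] = ['SimHei']   # so Chinese labels render in plots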

2. Implementation details:

1. Read the text:

def read_txt():
    # read the whole novel into one string; the source file is GBK-encoded
    file=open('连城诀【三联版】.txt','r',encoding='gbk')
    txt=file.read()
    file.close()
    return txt
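A quick sanity check that the file loads and decodes correctly (a minimal sketch):

text = read_txt()
print(len(text))     # total number of characters read
print(text[:60])     # preview the opening lines of the novel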

2. Part-of-speech statistics (written to a file):

def sda():
    import jieba.posseg as psg
    text=open("连城诀【三联版】.txt", encoding='gbk', errors='ignore').read()
    seg=psg.cut(text)                            # yields (word, flag) pairs
    file=open("词性.txt",'a+',encoding='utf-8')
    for ele in seg:
        file.write('{}/{}\n'.format(ele.word, ele.flag))   # e.g. 狄云/nr
    file.close()
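The POS tags are also a handy way to find candidate character names automatically: jieba tags person names with the flag 'nr'. A minimal sketch (the min_count threshold is an arbitrary assumption):

from collections import Counter
import jieba.posseg as psg

def candidate_names(text, min_count=10):
    # count tokens tagged as person names ('nr'), ignoring single characters
    tally = Counter(w.word for w in psg.cut(text) if w.flag == 'nr' and len(w.word) > 1)
    return [name for name, n in tally.most_common() if n >= min_count]

# candidate_names(read_txt()) should surface the main characters such as 狄云 and 水笙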

3. Load the stop-word file

def stopwordslist(filepath):
    # one stop word per line, UTF-8 encoded
    stopwords=[line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()]
    return stopwords
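As noted under the sample output, generic words such as "二人" and "不会" can slip through the stop list; the simplest manual fix is to append them after loading (a minimal sketch, using the stop.txt file name from the next step):

stopwords = stopwordslist('stop.txt')
stopwords += ['二人', '不会']    # extra words spotted in the word cloud, filtered by hand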

4. Tokenize and count word frequencies (written to a file)

def write_txt():
    words = jieba.lcut(read_txt())     # tokenize with jieba's precise mode
    counts={}                          # map each word to its number of occurrences
    stopwords=stopwordslist('stop.txt')
    for word in words:
        if len(word) == 1:    # skip single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1    # add 1 every time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)    # sort by frequency, descending

    f=open("词频统计.txt","w",encoding='utf-8')    # write the word counts to a file
    for i in range(len(items)):
        word, count = items[i]
        f.writelines("{0:<5}{1:>5}\n".format(word, count))
    f.close()
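A quick way to check the result is to read back the first few lines of 词频统计.txt (a minimal sketch):

with open("词频统计.txt", encoding='utf-8') as f:
    for line in list(f)[:10]:
        print(line.rstrip())    # the ten most frequent words, typically the main characters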

5. Generate the word cloud:

def creat_wordcloud():
    f_0=open("词频统计.txt",'r',encoding='utf-8')
    bg_pic=plt.imread('张国荣.jpg')    # mask image that shapes the word cloud
    text=f_0.read()
    f_0.close()
    wcloud=wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",   # a Chinese font is required
                           background_color="white",width=1000,
                           max_words=500,
                           mask=bg_pic,
                           height=860,
                           margin=2,
                           ).generate(text)

    wcloud.to_file("连城诀cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
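Note that generate() only sees each word once in 词频统计.txt, so the counts written by write_txt() do not influence word sizes. If you want frequency-weighted sizes, wordcloud also provides generate_from_frequencies; a minimal sketch reusing the same parameters (the output file name here is made up for illustration):

def creat_wordcloud_from_counts():
    freqs={}
    with open("词频统计.txt", encoding='utf-8') as f:
        for line in f:
            parts=line.split()
            if len(parts)==2:
                freqs[parts[0]]=int(parts[1])    # word -> count
    wc=wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
                           background_color="white",width=1000,height=860,
                           max_words=500,margin=2)
    wc.generate_from_frequencies(freqs)     # sizes now follow the real counts
    wc.to_file("连城诀cloud_freq.jpg")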

6. Generate the character relationship graph:

def creat_relationship():
    Names=['狄云','水笙','万震山','丁典','戚芳','万圭','花铁干','血刀老祖','戚长发','言达平','宝象','汪啸风','水岱']
    relations={}
    lst_para=(read_txt()).split('\n')    # lst_para holds one paragraph per element
    for text in lst_para:
        for name_0 in Names:
            if name_0 in text:
                for name_1 in Names:
                    # count each unordered pair once per paragraph in which both names appear
                    if name_1 in text and name_0!=name_1 and (name_1,name_0) not in relations:
                        relations[(name_0,name_1)]=relations.get((name_0,name_1),0)+1
    maxRela=max([v for k,v in relations.items()])
    relations={k:v/maxRela for k,v in relations.items()}    # normalize the weights to [0, 1]
    #return relations


    plt.figure(figsize=(15,15))
    G=nx.Graph()
    for k,v in relations.items():
        G.add_edge(k[0],k[1],weight=v)
    # edges with weight greater than 0.6
    elarge=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']>0.6]
    # edges with weight greater than 0.3 and at most 0.6
    emidle=[(u,v) for (u,v,d) in G.edges(data=True) if 0.3<d['weight']<=0.6]
    # edges with weight of at most 0.3
    esmall=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']<=0.3]
    # graph layout
    pos=nx.spring_layout(G)
    # node style
    nx.draw_networkx_nodes(G,pos,alpha=0.8, node_size=1200)
    # style for edges with weight > 0.6
    nx.draw_networkx_edges(G,pos,edgelist=elarge, width=2.5,alpha=0.9,edge_color='g')
    # 0.3 ~ 0.6
    nx.draw_networkx_edges(G,pos,edgelist=emidle, width=1.5,alpha=0.6,edge_color='y')
    # <= 0.3
    nx.draw_networkx_edges(G,pos,edgelist=esmall, width=1,alpha=0.4,edge_color='b',style='dashed')
    nx.draw_networkx_labels(G,pos,font_size=12)

    plt.axis('off')
    plt.title("连城诀人物权重图")
    plt.show()
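To see which pairs dominate before plotting, you can sort the normalized relations dictionary; a minimal sketch (it assumes the commented-out return relations is re-enabled so the function returns the dictionary):

relations = creat_relationship()    # with `return relations` re-enabled
top = sorted(relations.items(), key=lambda kv: kv[1], reverse=True)[:5]
for (a, b), weight in top:
    print(a, b, round(weight, 2))   # the five most strongly linked character pairs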

Complete code:

import jieba
import matplotlib.pyplot as plt
import wordcloud
import networkx as nx
import matplotlib
import jieba.posseg as psg
matplotlib.rcParams['font.sans-serif']=['SimHei']
#read the text
def read_txt():
    file=open('连城诀【三联版】.txt','r',encoding='gbk')
    txt=file.read()
    file.close()
    return txt

#part-of-speech statistics (written to a file)
def sda():
    import jieba.posseg as psg
    text=open("连城诀【三联版】.txt", encoding='gbk', errors='ignore').read()
    seg=psg.cut(text)
    file=open("词性.txt",'a+',encoding='utf-8')
    for ele in seg:
        file.write('{}/{}\n'.format(ele.word, ele.flag))
    file.close()

#stop-word file
def stopwordslist(filepath):
    stopwords=[line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()]
    return stopwords

#tokenize and count word frequencies (written to a file)
def write_txt():
    words = jieba.lcut(read_txt())     # tokenize with jieba's precise mode
    counts={}                          # map each word to its number of occurrences
    stopwords=stopwordslist('stop.txt')
    for word in words:
        if len(word) == 1:    # skip single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1    # add 1 every time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)    # sort by frequency, descending

    f=open("词频统计.txt","w",encoding='utf-8')    # write the word counts to a file
    for i in range(len(items)):
        word, count = items[i]
        f.writelines("{0:<5}{1:>5}\n".format(word, count))
    f.close()

#generate the word cloud
def creat_wordcloud():
    f_0=open("词频统计.txt",'r',encoding='utf-8')
    bg_pic=plt.imread('张国荣.jpg')    # mask image that shapes the word cloud
    text=f_0.read()
    f_0.close()
    wcloud=wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",   # a Chinese font is required
                           background_color="white",width=1000,
                           max_words=500,
                           mask=bg_pic,
                           height=860,
                           margin=2,
                           ).generate(text)

    wcloud.to_file("连城诀cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()


#generate the character relationship graph (copied straight from a book)
def creat_relationship():
    Names=['狄云','水笙','万震山','丁典','戚芳','万圭','花铁干','血刀老祖','戚长发','言达平','宝象','汪啸风','水岱']
    relations={}
    lst_para=(read_txt()).split('\n')    # lst_para holds one paragraph per element
    for text in lst_para:
        for name_0 in Names:
            if name_0 in text:
                for name_1 in Names:
                    # count each unordered pair once per paragraph in which both names appear
                    if name_1 in text and name_0!=name_1 and (name_1,name_0) not in relations:
                        relations[(name_0,name_1)]=relations.get((name_0,name_1),0)+1
    maxRela=max([v for k,v in relations.items()])
    relations={k:v/maxRela for k,v in relations.items()}    # normalize the weights to [0, 1]
    #return relations


    plt.figure(figsize=(15,15))
    G=nx.Graph()
    for k,v in relations.items():
        G.add_edge(k[0],k[1],weight=v)
    # edges with weight greater than 0.6
    elarge=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']>0.6]
    # edges with weight greater than 0.3 and at most 0.6
    emidle=[(u,v) for (u,v,d) in G.edges(data=True) if 0.3<d['weight']<=0.6]
    # edges with weight of at most 0.3
    esmall=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']<=0.3]
    # graph layout
    pos=nx.spring_layout(G)
    # node style
    nx.draw_networkx_nodes(G,pos,alpha=0.8, node_size=1200)
    # style for edges with weight > 0.6
    nx.draw_networkx_edges(G,pos,edgelist=elarge, width=2.5,alpha=0.9,edge_color='g')
    # 0.3 ~ 0.6
    nx.draw_networkx_edges(G,pos,edgelist=emidle, width=1.5,alpha=0.6,edge_color='y')
    # <= 0.3
    nx.draw_networkx_edges(G,pos,edgelist=esmall, width=1,alpha=0.4,edge_color='b',style='dashed')
    nx.draw_networkx_labels(G,pos,font_size=12)

    plt.axis('off')
    plt.title("连城诀人物权重图")
    plt.show()

def main():
    write_txt()
    creat_wordcloud()
    creat_relationship()

if __name__ == '__main__':
    main()