Text analysis of a novel with Python (character relationship graph, character word cloud, etc.)

Text analysis

0. Sample output

[Figure: word cloud output. Words such as "二人" and "不会" were not filtered out here; they can be removed manually.]

[Figure: character relationship graph]

1. Overall approach:

  1. Libraries used: jieba, matplotlib, networkx, wordcloud (a setup sketch follows this list)
  2. Text analyzed: the Sanlian (三联) edition of 连城诀 (A Deadly Secret)
  3. Tools needed: Python, the novel's text file, and a Chinese stop-word file.
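All four libraries are available from PyPI and can be installed up front, e.g. `pip install jieba matplotlib networkx wordcloud`. A minimal setup sketch of the imports and the matplotlib font setting used throughout (it mirrors the complete code at the end and assumes the SimHei font is installed):

import jieba
import matplotlib
import matplotlib.pyplot as plt
import networkx as nx
import wordcloud

matplotlib.rcParams['font.sans-serif'] = ['SimHei']   # so Chinese labels render in plots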

2. Implementation details:

1. Read the text:

def read_txt():
    # read the whole novel into one string; the source file is GBK-encoded
    file=open('连城诀【三联版】.txt','r',encoding='gbk')
    txt=file.read()
    file.close()
    return txt
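A quick sanity check that the file loads and decodes correctly (a minimal sketch):

text = read_txt()
print(len(text))     # total number of characters read
print(text[:60])     # preview the opening lines of the novel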

2. Part-of-speech statistics (written to a file):

def sda():
    import jieba.posseg as psg
    text=open("连城诀【三联版】.txt", encoding='gbk', errors='ignore').read()
    seg=psg.cut(text)                            # yields (word, flag) pairs
    file=open("词性.txt",'a+',encoding='utf-8')
    for ele in seg:
        file.write('{}/{}\n'.format(ele.word, ele.flag))   # e.g. 狄云/nr
    file.close()
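The POS tags are also a handy way to find candidate character names automatically: jieba tags person names with the flag 'nr'. A minimal sketch (the min_count threshold is an arbitrary assumption):

from collections import Counter
import jieba.posseg as psg

def candidate_names(text, min_count=10):
    # count tokens tagged as person names ('nr'), ignoring single characters
    tally = Counter(w.word for w in psg.cut(text) if w.flag == 'nr' and len(w.word) > 1)
    return [name for name, n in tally.most_common() if n >= min_count]

# candidate_names(read_txt()) should surface the main characters such as 狄云 and 水笙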

3. Load the stop-word file

def stopwordslist(filepath):
    # one stop word per line, UTF-8 encoded
    stopwords=[line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()]
    return stopwords
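As noted under the sample output, generic words such as "二人" and "不会" can slip through the stop list; the simplest manual fix is to append them after loading (a minimal sketch, using the stop.txt file name from the next step):

stopwords = stopwordslist('stop.txt')
stopwords += ['二人', '不会']    # extra words spotted in the word cloud, filtered by hand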

4. Tokenize and count word frequencies (written to a file)

def write_txt():
    words = jieba.lcut(read_txt())     # tokenize with jieba's precise mode
    counts={}                          # map each word to its number of occurrences
    stopwords=stopwordslist('stop.txt')
    for word in words:
        if len(word) == 1:    # skip single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1    # add 1 every time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)    # sort by frequency, descending

    f=open("词频统计.txt","w",encoding='utf-8')    # write the word counts to a file
    for i in range(len(items)):
        word, count = items[i]
        f.writelines("{0:<5}{1:>5}\n".format(word, count))
    f.close()
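A quick way to check the result is to read back the first few lines of 词频统计.txt (a minimal sketch):

with open("词频统计.txt", encoding='utf-8') as f:
    for line in list(f)[:10]:
        print(line.rstrip())    # the ten most frequent words, typically the main characters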

5. Generate the word cloud:

def creat_wordcloud():
    f_0=open("词频统计.txt",'r',encoding='utf-8')
    bg_pic=plt.imread('张国荣.jpg')    # mask image that shapes the word cloud
    text=f_0.read()
    f_0.close()
    wcloud=wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",   # a Chinese font is required
                           background_color="white",width=1000,
                           max_words=500,
                           mask=bg_pic,
                           height=860,
                           margin=2,
                           ).generate(text)

    wcloud.to_file("连城诀cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()
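Note that generate() only sees each word once in 词频统计.txt, so the counts written by write_txt() do not influence word sizes. If you want frequency-weighted sizes, wordcloud also provides generate_from_frequencies; a minimal sketch reusing the same parameters (the output file name here is made up for illustration):

def creat_wordcloud_from_counts():
    freqs={}
    with open("词频统计.txt", encoding='utf-8') as f:
        for line in f:
            parts=line.split()
            if len(parts)==2:
                freqs[parts[0]]=int(parts[1])    # word -> count
    wc=wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
                           background_color="white",width=1000,height=860,
                           max_words=500,margin=2)
    wc.generate_from_frequencies(freqs)     # sizes now follow the real counts
    wc.to_file("连城诀cloud_freq.jpg")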

6. Generate the character relationship graph:

def creat_relationship():
    Names=['狄云','水笙','万震山','丁典','戚芳','万圭','花铁干','血刀老祖','戚长发','言达平','宝象','汪啸风','水岱']
    relations={}
    lst_para=(read_txt()).split('\n')    # lst_para holds one paragraph per element
    for text in lst_para:
        for name_0 in Names:
            if name_0 in text:
                for name_1 in Names:
                    # count each unordered pair once per paragraph in which both names appear
                    if name_1 in text and name_0!=name_1 and (name_1,name_0) not in relations:
                        relations[(name_0,name_1)]=relations.get((name_0,name_1),0)+1
    maxRela=max([v for k,v in relations.items()])
    relations={k:v/maxRela for k,v in relations.items()}    # normalize the weights to [0, 1]
    #return relations


    plt.figure(figsize=(15,15))
    G=nx.Graph()
    for k,v in relations.items():
        G.add_edge(k[0],k[1],weight=v)
    # edges with weight greater than 0.6
    elarge=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']>0.6]
    # edges with weight greater than 0.3 and at most 0.6
    emidle=[(u,v) for (u,v,d) in G.edges(data=True) if 0.3<d['weight']<=0.6]
    # edges with weight of at most 0.3
    esmall=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']<=0.3]
    # graph layout
    pos=nx.spring_layout(G)
    # node style
    nx.draw_networkx_nodes(G,pos,alpha=0.8, node_size=1200)
    # style for edges with weight > 0.6
    nx.draw_networkx_edges(G,pos,edgelist=elarge, width=2.5,alpha=0.9,edge_color='g')
    # 0.3 ~ 0.6
    nx.draw_networkx_edges(G,pos,edgelist=emidle, width=1.5,alpha=0.6,edge_color='y')
    # <= 0.3
    nx.draw_networkx_edges(G,pos,edgelist=esmall, width=1,alpha=0.4,edge_color='b',style='dashed')
    nx.draw_networkx_labels(G,pos,font_size=12)

    plt.axis('off')
    plt.title("连城诀人物权重图")
    plt.show()
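To see which pairs dominate before plotting, you can sort the normalized relations dictionary; a minimal sketch (it assumes the commented-out return relations is re-enabled so the function returns the dictionary):

relations = creat_relationship()    # with `return relations` re-enabled
top = sorted(relations.items(), key=lambda kv: kv[1], reverse=True)[:5]
for (a, b), weight in top:
    print(a, b, round(weight, 2))   # the five most strongly linked character pairs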

Complete code:

import jieba
import matplotlib.pyplot as plt
import wordcloud
import networkx as nx
import matplotlib
import jieba.posseg as psg
matplotlib.rcParams['font.sans-serif']=['SimHei']
#read the text
def read_txt():
    file=open('连城诀【三联版】.txt','r',encoding='gbk')
    txt=file.read()
    file.close()
    return txt

#part-of-speech statistics (written to a file)
def sda():
    import jieba.posseg as psg
    text=open("连城诀【三联版】.txt", encoding='gbk', errors='ignore').read()
    seg=psg.cut(text)
    file=open("词性.txt",'a+',encoding='utf-8')
    for ele in seg:
        file.write('{}/{}\n'.format(ele.word, ele.flag))
    file.close()

#stop-word file
def stopwordslist(filepath):
    stopwords=[line.strip() for line in open(filepath,'r',encoding='utf-8').readlines()]
    return stopwords

#tokenize and count word frequencies (written to a file)
def write_txt():
    words = jieba.lcut(read_txt())     # tokenize with jieba's precise mode
    counts={}                          # map each word to its number of occurrences
    stopwords=stopwordslist('stop.txt')
    for word in words:
        if len(word) == 1:    # skip single-character tokens
            continue
        elif word not in stopwords:
            counts[word] = counts.get(word, 0) + 1    # add 1 every time the word appears
    items = list(counts.items())
    items.sort(key=lambda x: x[1], reverse=True)    # sort by frequency, descending

    f=open("词频统计.txt","w",encoding='utf-8')    # write the word counts to a file
    for i in range(len(items)):
        word, count = items[i]
        f.writelines("{0:<5}{1:>5}\n".format(word, count))
    f.close()

#generate the word cloud
def creat_wordcloud():
    f_0=open("词频统计.txt",'r',encoding='utf-8')
    bg_pic=plt.imread('张国荣.jpg')    # mask image that shapes the word cloud
    text=f_0.read()
    f_0.close()
    wcloud=wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",   # a Chinese font is required
                           background_color="white",width=1000,
                           max_words=500,
                           mask=bg_pic,
                           height=860,
                           margin=2,
                           ).generate(text)

    wcloud.to_file("连城诀cloud.jpg")
    plt.imshow(wcloud)
    plt.axis('off')
    plt.show()


#generate the character relationship graph (copied straight from a book)
def creat_relationship():
    Names=['狄云','水笙','万震山','丁典','戚芳','万圭','花铁干','血刀老祖','戚长发','言达平','宝象','汪啸风','水岱']
    relations={}
    lst_para=(read_txt()).split('\n')    # lst_para holds one paragraph per element
    for text in lst_para:
        for name_0 in Names:
            if name_0 in text:
                for name_1 in Names:
                    # count each unordered pair once per paragraph in which both names appear
                    if name_1 in text and name_0!=name_1 and (name_1,name_0) not in relations:
                        relations[(name_0,name_1)]=relations.get((name_0,name_1),0)+1
    maxRela=max([v for k,v in relations.items()])
    relations={k:v/maxRela for k,v in relations.items()}    # normalize the weights to [0, 1]
    #return relations


    plt.figure(figsize=(15,15))
    G=nx.Graph()
    for k,v in relations.items():
        G.add_edge(k[0],k[1],weight=v)
    # edges with weight greater than 0.6
    elarge=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']>0.6]
    # edges with weight greater than 0.3 and at most 0.6
    emidle=[(u,v) for (u,v,d) in G.edges(data=True) if 0.3<d['weight']<=0.6]
    # edges with weight of at most 0.3
    esmall=[(u,v) for (u,v,d) in G.edges(data=True) if d['weight']<=0.3]
    # graph layout
    pos=nx.spring_layout(G)
    # node style
    nx.draw_networkx_nodes(G,pos,alpha=0.8, node_size=1200)
    # style for edges with weight > 0.6
    nx.draw_networkx_edges(G,pos,edgelist=elarge, width=2.5,alpha=0.9,edge_color='g')
    # 0.3 ~ 0.6
    nx.draw_networkx_edges(G,pos,edgelist=emidle, width=1.5,alpha=0.6,edge_color='y')
    # <= 0.3
    nx.draw_networkx_edges(G,pos,edgelist=esmall, width=1,alpha=0.4,edge_color='b',style='dashed')
    nx.draw_networkx_labels(G,pos,font_size=12)

    plt.axis('off')
    plt.title("连城诀人物权重图")
    plt.show()

def main():
    write_txt()
    creat_wordcloud()
    creat_relationship()

if __name__ == '__main__':
    main()