基于Python的《红楼梦》文本分析与可视化

最新推荐文章于 2024-09-06 21:00:00 发布

CSDN_kada

最新推荐文章于 2024-09-06 21:00:00 发布

阅读量334

点赞数 10

文章标签：信息可视化 python

本文链接：https://blog.csdn.net/CSDN_kada/article/details/139812052

版权

数据

《红楼梦》txt文本文件

人物出场频次

1.统计人物出场频次

import jieba as j
import matplotlib
from matplotlib import pyplot as plt
# Use the SimHei font so matplotlib can render Chinese glyphs in later plots.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Canonical character names; only these are counted.
names = ['贾母', '贾珍', '贾蓉', '贾赦', '贾政', '贾琏', '袭人', '王熙凤', '紫鹃', '翠缕', '香菱', '豆官', '薛蝌',
         '薛蟠', '贾宝玉', '林黛玉', '平儿', '薛宝钗', '晴雯', '林之孝']

# Alias / nickname -> canonical name. Tokens that are not listed here are
# looked up in `names` unchanged.
# '凤辣子' is the spelling consistent with the 凤 in 王熙凤; the original
# '风辣子' entry is kept so previously matched tokens still map the same way.
aliases = {
    '老太太': '贾母',
    '老祖宗': '贾母',
    '史太君': '贾母',
    '珍哥儿': '贾珍',
    '大哥哥': '贾珍',
    '老爷': '贾政',
    '宝玉': '贾宝玉',
    '宝二爷': '贾宝玉',
    '凤辣子': '王熙凤',
    '风辣子': '王熙凤',
    '熙凤': '王熙凤',
    '鹦哥': '紫鹃',
}

# Read the whole novel; the context manager closes the file handle promptly.
with open('红楼梦.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Tokenise the text with jieba; `words` is a list of all tokens.
words = j.lcut(txt)

# counts maps canonical name -> number of tokens that resolved to it.
counts = {}
for word in words:
    # Skip single-character tokens (single characters or punctuation).
    if len(word) == 1:
        continue
    # Collapse aliases onto the canonical name before counting.
    word = aliases.get(word, word)
    if word in names:
        counts[word] = counts.get(word, 0) + 1

# Print the names ordered by descending count, name left-aligned in 10
# columns and count right-aligned in 5.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
for word, count in items:
    print('{0:<10}{1:>5}'.format(word, count))

2.词云图绘制

# 人物出场的频次
import jieba as j
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
# Canonical character names; only these end up in the word cloud.
names = ['贾母', '贾珍', '贾蓉', '贾赦', '贾政', '贾琏', '袭人', '王熙凤', '紫鹃', '翠缕', '香菱', '豆官', '薛蝌', '薛蟠',
         '贾宝玉', '林黛玉', '平儿', '薛宝钗', '晴雯', '林之孝']

# Alias / nickname -> canonical name.
# Fixes: the key "熙风" was a typo for "熙凤" and therefore never matched;
# "凤辣子" is added alongside the original "风辣子" spelling.
name_map = {
    "老太太": "贾母",
    "老祖宗": "贾母",
    "史太君": "贾母",
    "珍哥儿": "贾珍",
    "大哥哥": "贾珍",
    "老爷": "贾政",
    "宝二爷": "贾宝玉",
    "宝玉": "贾宝玉",
    "熙凤": "王熙凤",
    "凤辣子": "王熙凤",
    "风辣子": "王熙凤",
    "鹦哥": "紫鹃"
}

# Read the whole novel; the context manager closes the file handle promptly.
with open('红楼梦.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Tokenise and count how often each canonical name occurs.
words = j.lcut(txt)
word_counts = Counter()
for word in words:
    # Skip single-character tokens and tokens without any alphabetic
    # character (str.isalpha is also true for CJK characters).
    if len(word) == 1 or not any(char.isalpha() for char in word):
        continue
    word = name_map.get(word, word)
    if word in names:
        word_counts[word] += 1

# WordCloud.generate_from_frequencies is given a plain dict.
word_dict = dict(word_counts)

# Configure the word cloud.
wc = WordCloud(font_path='C:\\Windows\\Fonts\\STXINWEI.TTF',  # font file that must support Chinese (Windows path)
               scale=32,
               background_color='white',  # background colour
               max_words=20,  # maximum number of words shown
               max_font_size=100,  # largest font size
               min_font_size=10,   # smallest font size
               width=250,  # canvas width
               height=150,  # canvas height
               colormap='viridis'  # matplotlib colormap used for the words
)

# Build the cloud from the name -> count mapping.
wc.generate_from_frequencies(word_dict)

# Show the cloud with a Chinese-capable font for the title.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.figure(figsize=(10, 5))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.title('红楼梦人物词云', fontsize=20)
plt.show()

3.柱状图绘制

# Bar chart of character appearance counts.
# Relies on `counts` (name -> count) and `plt` from the snippets above.
names_list = [*counts]
counts_list = [*counts.values()]

# Only characters whose count is at least `threshold` are drawn.
threshold = 5
kept = {name: total for name, total in counts.items() if total >= threshold}
filtered_names = list(kept)
filtered_counts = list(kept.values())

# Cycle through a fixed palette so there is exactly one colour per bar.
palette = ['skyblue', 'lightcoral', 'lightgreen', 'yellowgreen', 'orchid', 'violet']
num_colors = len(palette)
colors = [palette[position % num_colors] for position, _ in enumerate(filtered_names)]

# A larger canvas leaves room for many names on the x axis.
plt.figure(figsize=(12, 8))
# One plt.bar() call draws all bars.
plt.bar(filtered_names, filtered_counts, color=colors)
plt.title('红楼梦人物出场次数统计')
plt.xlabel('人物名称')
plt.ylabel('出场次数')
# Rotate the tick labels so the names stay readable.
plt.xticks(rotation=45)
# Keep the labels inside the figure area.
plt.tight_layout()
plt.show()

章节字数统计

1.章节字数统计（保存为json文件）

import json
import re

# Read the whole novel; the context manager closes the file handle promptly.
with open('红楼梦.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Find candidate chapter headings of the form 第…回, where … is one or more
# characters in the range \u4e00-\u9fa5 (CJK ideographs).
titles = re.findall('第[\u4e00-\u9fa5]+回', text)

# Keep each heading once, in order of first appearance. Matches longer than
# 5 characters are dropped so that ordinary sentences containing 回 are not
# mistaken for headings.
# NOTE(review): headings longer than 5 characters (e.g. six-character
# numerals) are dropped by this filter too — confirm against the edition used.
chapters = []
for title in titles:
    if title not in chapters and len(title) <= 5:
        chapters.append(title)

# Start offset of each chapter = first occurrence of its heading in the text.
chapter_start = [text.index(title) for title in chapters]
# Each chapter ends where the next one starts; the last ends at end of text.
chapter_end = chapter_start[1:] + [len(text)]
# List of (start, end) offset pairs, one per chapter.
chapterIndex = list(zip(chapter_start, chapter_end))

# Character count per chapter, measured after stripping leading/trailing
# whitespace from the chapter text.
chapter_lengths = []
for i, (start, end) in enumerate(chapterIndex, start=1):
    chapter_length = len(text[start:end].strip())
    # Fix: the label needs an f-string; without the `f` prefix every record
    # carried the literal text '第{i}回' instead of the chapter number.
    chapter_lengths.append({'chapter': f'第{i}回', 'length': chapter_length})

# Write the per-chapter lengths as pretty-printed, non-ASCII-escaped JSON.
with open('chapter_lengths.json', 'w', encoding='utf-8') as file:
    json.dump(chapter_lengths, file, ensure_ascii=False, indent=4)

2.柱状图绘制

# 章回处理
import re
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# Use the SimHei font so matplotlib can render Chinese glyphs.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Read the whole novel; the context manager closes the file handle promptly.
with open('红楼梦.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Find candidate chapter headings of the form 第…回, where … is one or more
# characters in the range \u4e00-\u9fa5 (CJK ideographs).
titles = re.findall('第[\u4e00-\u9fa5]+回', text)

# Keep each heading once, in order of first appearance. Matches longer than
# 5 characters are dropped so that ordinary sentences containing 回 are not
# mistaken for headings.
chapters = []
for title in titles:
    if title not in chapters and len(title) <= 5:
        chapters.append(title)

# Start offset of each chapter = first occurrence of its heading in the text.
chapter_start = [text.index(title) for title in chapters]
# Each chapter ends where the next one starts; the last ends at end of text.
chapter_end = chapter_start[1:] + [len(text)]
# List of (start, end) offset pairs, one per chapter.
chapterIndex = list(zip(chapter_start, chapter_end))

# Bar labels and bar heights, in chapter order.
chapter_titles = []
word_counts = []

# Fix: pair every span with its own entry of `chapters`. The spans were built
# from `chapters`, so labelling them through the unfiltered `titles` list
# mislabels bars as soon as `titles` contains duplicates or false matches.
for title, (start, end) in zip(chapters, chapterIndex):
    chapter_text = text[start:end]
    # Character count with newlines and spaces removed.
    word_count = len(chapter_text.replace('\n', '').replace(' ', ''))
    chapter_titles.append(title)
    word_counts.append(word_count)

# Colour the bars along the viridis colormap, one evenly spaced sample per bar.
cmap = plt.get_cmap('viridis')
plt.figure(figsize=(12, 6))
bars = plt.bar(chapter_titles, word_counts, align='center', color=cmap(np.linspace(0, 1, len(chapter_titles))))
# Title and axis labels.
plt.title('红楼梦各章节字数统计')
plt.xlabel('章节')
plt.ylabel('字数')
# Rotate the tick labels so the chapter names stay readable.
plt.xticks(rotation=90)
# Let matplotlib adjust the margins to fit the labels.
plt.tight_layout()
plt.show()

3.章节字数与章节段落关系

# Scatter plot: newline count vs. character count for every chapter.
# Relies on `text`, `chapters`, `chapterIndex` and `plt` from the snippet above.
cnt_chap = []  # number of '\n' characters per chapter (used as paragraph count)
cnt_word = []  # total number of characters per chapter

# Fix: iterate over the chapters that were actually detected instead of a
# hard-coded range(120), which raised IndexError when fewer were found.
for start, end in chapterIndex:
    chapter_text = text[start:end]
    cnt_chap.append(chapter_text.count('\n'))
    cnt_word.append(len(chapter_text))

# New figure with a fixed size.
plt.figure(figsize=(15, 8))
# One point per chapter.
plt.scatter(cnt_chap, cnt_word, color='blue', s=50, alpha=0.8)
# Label every point with its chapter heading, nudged up and to the right so
# the text does not sit on top of the marker.
for x, y, label in zip(cnt_chap, cnt_word, chapters):
    plt.text(x + 0.5, y + 10, label, size=7, ha='left', va='bottom')
# Light grid lines.
plt.grid(True, alpha=0.5)
# Axis labels and title.
plt.xlabel('章节段数', fontdict={'fontsize': 14})
plt.ylabel('章节字数', fontdict={'fontsize': 14})
plt.title('《红楼梦》章节段数与字数关系散点图', fontdict={'fontsize': 16})
plt.show()

人物关系映射

from itertools import combinations

import matplotlib  # fix: needed for matplotlib.rcParams below, was not imported in this snippet
import networkx as nx
from matplotlib import pyplot as plt

# Use the SimHei font so matplotlib can render Chinese glyphs.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

# Read the whole novel; the context manager closes the file handle promptly.
with open('红楼梦.txt', 'r', encoding='utf-8') as f:
    text = f.read()

Names=['宝玉','凤姐','贾母','黛玉','王夫人','老太太','袭人','贾琏','平儿','宝钗','薛姨妈','探春','鸳鸯','贾政','晴雯','湘云','刘姥姥','邢夫人','贾珍','紫鹃','香菱','尤氏','薛蟠','贾赦']

# relations maps (name1, name2) -> number of paragraphs that mention both.
# Each unordered pair is stored once, keyed in the order the names appear
# in `Names`.
relations = {}

# Two characters are treated as related when they occur in the same
# paragraph; paragraphs are the pieces between '\n' characters.
lst_para = text.split('\n')
for t in lst_para:
    # Names mentioned in this paragraph, in `Names` order.
    present = [name for name in Names if name in t]
    # combinations() yields every unordered pair exactly once.
    for name1, name2 in combinations(present, 2):
        relations[(name1, name2)] = relations.get((name1, name2), 0) + 1

# Normalise the co-occurrence counts to the range (0, 1] by dividing by the
# largest count, which makes the edge thresholds below scale-independent.
maxRela = max(relations.values())
relations = {k: v / maxRela for k, v in relations.items()}

# Build an undirected graph whose edge weight is the normalised count.
G = nx.Graph()
for (name1, name2), weight in relations.items():
    G.add_edge(name1, name2, weight=weight)

# Split the edges into three strength classes for styling.
elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.6]
emidle = [(u, v) for (u, v, d) in G.edges(data=True) if 0.3 < d['weight'] <= 0.6]
esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.3]

# Node weight = sum of the weights of all incident edges.
# Fix: `relations` stores each pair in one orientation only, so looking up
# (node, neighbour) missed every edge stored as (neighbour, node); the
# weighted degree of the undirected graph counts both.
node_weights = dict(G.degree(weight='weight'))
# Scale the node weights to (0, 1] so they can index the colormap.
max_node_weight = max(node_weights.values())
node_weights = {node: weight / max_node_weight for node, weight in node_weights.items()}

# Single figure and single layout (the duplicated calls only produced an
# extra empty figure and a discarded layout).
cmap = plt.get_cmap('viridis')
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)

# Node colour follows the normalised node weight; the list is built in
# G.nodes() order, which is the order draw_networkx_nodes draws in by default.
node_colors = [cmap(node_weights[node]) for node in G.nodes()]
nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=800, node_color=node_colors)

# Edges: strong = thick green, medium = red, weak = thin dashed blue.
nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9, edge_color='g')
nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6, edge_color='r')
nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4, edge_color='b', style='dashed')

# Node labels.
nx.draw_networkx_labels(G, pos, font_size=12)
# Hide the axes and add the title.
plt.axis('off')
plt.title('《红楼梦》主要人物社交关系网络图')
plt.show()

情绪变化图

import pyecharts.options as opts
from pyecharts.charts import Line
import re
# Read the whole novel; the context manager closes the file handle promptly.
with open('红楼梦.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Find candidate chapter headings of the form 第…回, where … is one or more
# characters in the range \u4e00-\u9fa5 (CJK ideographs).
titles = re.findall('第[\u4e00-\u9fa5]+回', text)

# Keep each heading once, in order of first appearance. Matches longer than
# 5 characters are dropped so that ordinary sentences containing 回 are not
# mistaken for headings.
chapters = []
for title in titles:
    if title not in chapters and len(title) <= 5:
        chapters.append(title)

# Start offset of each chapter = first occurrence of its heading in the text.
chapter_start = [text.index(title) for title in chapters]
# Each chapter ends where the next one starts; the last ends at end of text.
chapter_end = chapter_start[1:] + [len(text)]
# List of (start, end) offset pairs, one per chapter.
chapterIndex = list(zip(chapter_start, chapter_end))

# Per-chapter counts of the "happy" characters (笑, 喜) and the "sad"
# characters (哭, 悲).
cnt_laugh = []
cnt_cry = []
# Fix: iterate over the chapters that were actually detected instead of a
# hard-coded range(120), which raised IndexError when fewer were found.
for start, end in chapterIndex:
    chapter_text = text[start:end]  # slice once, count four times
    cnt_laugh.append(chapter_text.count('笑') + chapter_text.count('喜'))
    cnt_cry.append(chapter_text.count('哭') + chapter_text.count('悲'))

# Area-style line chart with one series per emotion group.
line = (
    Line()
    # Category axis data is passed as a list of strings (the chapter
    # headings) rather than a bare range object.
    .add_xaxis(list(chapters))
    .add_yaxis("笑与喜", cnt_laugh, areastyle_opts=opts.AreaStyleOpts(opacity=0.5))
    .add_yaxis("哭与悲", cnt_cry, areastyle_opts=opts.AreaStyleOpts(opacity=0.5))
    .set_global_opts(
        title_opts=opts.TitleOpts(title="《红楼梦》情绪变化折线图"),
        xaxis_opts=opts.AxisOpts(type_="category"),  # category axis
        yaxis_opts=opts.AxisOpts(name="出现次数"),  # y-axis name
    )
)

# Render the chart to an HTML file.
line.render("line_area_style.html")

贾宝玉前20回出场次数

import re
from pyecharts.charts import Sankey
from pyecharts import options as opts

# Read the text of the first 20 chapters.
with open('前20回.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Matches any of the three ways the character is referred to here.
pattern = re.compile(r'宝玉|贾宝玉|宝二爷')
# One match per chapter: a 第…回 heading followed (non-greedily, across
# newlines) by everything up to the next 第…回 heading or the end of text.
chapter_pattern = re.compile(r'第[\u4e00-\u9fa5]+回.*?(?=第[\u4e00-\u9fa5]+回|\Z)', re.DOTALL)

# Heading line -> number of mentions in that chapter's body.
jia_baoyu_counts = {}

# Every chunk holds the heading line plus the chapter body.
chapters_with_titles = chapter_pattern.findall(text)

for chapter_with_title in chapters_with_titles:
    # The first line is the heading, the rest is the body.
    # Fix: str.partition never fails, whereas unpacking split('\n', 1)
    # raised ValueError for a chunk that contains no newline.
    chapter_title, _, chapter_content = chapter_with_title.partition('\n')
    chapter_title = chapter_title.strip()
    chapter_content = chapter_content.strip()
    # Count the mentions in the body only (the heading line is excluded).
    count = len(pattern.findall(chapter_content))
    jia_baoyu_counts[chapter_title] = count

# Sankey nodes: one per chapter heading plus a single "总计" sink node.
total_node = {"name": "总计"}
nodes = [{"name": chapter_title} for chapter_title in jia_baoyu_counts] + [total_node]

# Sankey links: every chapter flows into "总计" with its mention count.
links = [
    {"source": chapter_title, "target": "总计", "value": count}
    for chapter_title, count in jia_baoyu_counts.items()
]

# Build the Sankey chart.
sankey = Sankey()
sankey.add(
    "出场次数",
    nodes,
    links,
    linestyle_opt=opts.LineStyleOpts(opacity=0.2, curve=0.5, color="source"),
    # Labels are shown to the right of each node; this replaces the separate
    # set_series_opts call that set the same label options a second time.
    label_opts=opts.LabelOpts(is_show=True, position="right"),
)

sankey.set_global_opts(
    title_opts=opts.TitleOpts(
        title="贾宝玉前20回出场次数",  # chart title
        title_textstyle_opts=opts.TextStyleOpts(font_size=15)  # title font size
    )
)

# Render the chart to an HTML file.
sankey.render("jia_baoyu_sankey.html")