Python实例题:Python基于共现提取《釜山行》人物关系-CSDN博客

本文链接：https://blog.csdn.net/shangzhiqi/article/details/148020193

Python实例题

题目

Python基于共现提取《釜山行》人物关系

实现思路

准备工作：获取电影《釜山行》的字幕文件，并定义主要人物列表。
字幕处理：读取字幕文件，提取文本内容并进行清理。
共现分析：遍历每一句台词，统计人物名称同时出现的次数。
结果展示：输出人物关系矩阵或图结构。

代码实现

import re
import matplotlib.pyplot as plt
import networkx as nx
from collections import defaultdict

# Names of the main characters to look for in the subtitles (adjust as needed).
characters = [
    "石宇", "盛京", "秀安", "尚华", "真熙", "常务", "棒球选手", "棒球女", "老人1", "老人2"
]

def read_subtitles(file_path):
    """Read an SRT subtitle file and return the dialogue text of each cue.

    The file is split into cues on blank (or whitespace-only) lines and each
    cue is parsed on its own, so a cue without any text can never absorb the
    index/timing lines of the cue that follows it.

    Args:
        file_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of strings, one per cue that has text. Multi-line cues keep
        their inner newlines; surrounding whitespace is stripped and cues
        whose text is empty are omitted.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # One cue: numeric index line, timing line, then optional text lines.
    timestamp = r'\d{2}:\d{2}:\d{2},\d{3}'
    cue_pattern = re.compile(
        r'\d+\n' + timestamp + r' --> ' + timestamp + r'[ \t]*(?:\n(.*))?$',
        re.DOTALL,
    )

    cleaned_subtitles = []
    # Cues are separated by blank lines; tolerate stray whitespace on them.
    for block in re.split(r'\n\s*\n', content):
        match = cue_pattern.search(block.strip())
        if match is None:
            # Not a cue (e.g. leading junk or text after an inner blank line).
            continue
        text = (match.group(1) or '').strip()
        if text:
            cleaned_subtitles.append(text)

    return cleaned_subtitles

def analyze_cooccurrence(subtitles, characters):
    """Count how often each pair of characters is named in the same subtitle.

    A character counts as present in a subtitle when its name occurs as a
    substring of that subtitle's text.

    Args:
        subtitles: Iterable of subtitle text strings.
        characters: List of character names to look for.

    Returns:
        A nested ``defaultdict`` such that ``result[a][b]`` is the number of
        subtitles mentioning both ``a`` and ``b``; counts are stored
        symmetrically (``result[a][b] == result[b][a]``).
    """
    matrix = defaultdict(lambda: defaultdict(int))

    for sentence in subtitles:
        # Names mentioned in this subtitle, in the order given by `characters`.
        mentioned = [name for name in characters if name in sentence]

        # A pair needs at least two names.
        if len(mentioned) < 2:
            continue

        # Visit every unordered pair once and record it in both directions.
        for position, first in enumerate(mentioned, start=1):
            for second in mentioned[position:]:
                matrix[first][second] += 1
                matrix[second][first] += 1

    return matrix

def visualize_relations(cooccurrence):
    """Draw the character co-occurrence network and save it as an image.

    Each pair with a positive co-occurrence count becomes a weighted edge.
    Node size scales with the node's degree and edge width with the count.

    Args:
        cooccurrence: Nested mapping ``{name: {other_name: count}}``.

    Side effects:
        Writes ``busan_relations.png`` to the current working directory and
        shows the figure with ``plt.show()``.
    """
    G = nx.Graph()

    # Add nodes and edges.
    for char1, neighbours in cooccurrence.items():
        for char2, weight in neighbours.items():
            # A nested defaultdict gains 0-valued entries (including
            # name-with-itself) whenever a missing pair is merely read.
            # Drawing those would connect every character to every other and
            # make all degree-based node sizes identical, so skip them.
            if char1 == char2 or weight <= 0:
                continue
            G.add_edge(char1, char2, weight=weight)

    # Node size follows degree; edge width follows co-occurrence count.
    node_size = [G.degree(node) * 300 for node in G.nodes()]
    edge_width = [G[u][v]['weight'] * 0.5 for u, v in G.edges()]

    # Draw the figure.
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=0.3)
    nx.draw_networkx_nodes(G, pos, node_size=node_size, alpha=0.7)
    nx.draw_networkx_labels(G, pos, font_size=12, font_family='SimHei')
    nx.draw_networkx_edges(G, pos, width=edge_width, alpha=0.5)

    plt.title("《釜山行》人物关系图", fontsize=16, fontfamily='SimHei')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig("busan_relations.png", dpi=300, bbox_inches='tight')
    plt.show()
def main():
    """Run the pipeline: read subtitles, count co-occurrences, print, plot."""
    # Replace with the path to the actual subtitle file.
    subtitle_file = "busan_subtitles.srt"

    # Read the subtitles.
    subtitles = read_subtitles(subtitle_file)

    # Count character co-occurrences.
    cooccurrence = analyze_cooccurrence(subtitles, characters)

    # Print the co-occurrence matrix.
    print("人物共现矩阵:")
    print("\t" + "\t".join(characters))
    for char1 in characters:
        # Use .get() rather than [] so that reading a missing pair does not
        # insert a 0 entry into the nested defaultdict; the same mapping is
        # passed to the visualiser below and must stay free of phantom pairs.
        counts = cooccurrence.get(char1, {})
        row = [counts.get(char2, 0) for char2 in characters]
        print(f"{char1}\t" + "\t".join(map(str, row)))

    # Draw the relationship graph.
    visualize_relations(cooccurrence)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

代码解释

read_subtitles 函数：
- 读取字幕文件内容，使用正则表达式提取每句台词的文本部分。
- 清理文本：去除每条台词首尾的空白字符，并丢弃内容为空的台词。
analyze_cooccurrence 函数：
- 初始化一个嵌套字典作为共现矩阵。
- 遍历每句台词，检查其中出现的人物。
- 若一句台词中出现多个人物，则更新共现矩阵，增加对应人物对的共现次数。
visualize_relations 函数：
- 使用 networkx 库创建图对象。
- 根据共现矩阵添加节点和边，节点大小和边的粗细分别对应人物的度和共现次数。
- 使用 matplotlib 绘制人物关系图并保存为图片。
main 函数：
- 读取字幕文件，分析共现关系，打印共现矩阵并可视化关系图。