文本分析-统计《西游记》小说中主要角色的出现频率

最新推荐文章于 2025-05-22 23:28:02 发布

jiege1024

最新推荐文章于 2025-05-22 23:28:02 发布

阅读量319

点赞数 3

文章标签：人工智能 python

本文链接：https://blog.csdn.net/jiege1024/article/details/147165069

版权

import csv
import os
import time
from collections import Counter

import jieba

def get_chinese_text(file_path):
    """
    Read a text file and return its contents segmented into words.

    Each candidate encoding is tried in turn; the first one that decodes the
    whole file is used, and the decoded text is tokenized with ``jieba.lcut``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The token list produced by ``jieba.lcut``. An empty list is returned
        (and the error printed, not raised) when the file cannot be opened,
        cannot be decoded with any candidate encoding, or segmentation fails.
    """
    # "ANSI" is an alias of the Windows-only "mbcs" codec. It stays first so
    # behavior on Windows is unchanged; on other platforms it is skipped.
    encodings = ['ANSI', 'UTF-8', 'GBK', 'GB18030']
    try:
        for encoding in encodings:
            try:
                with open(file_path, "r", encoding=encoding) as fp:
                    text = fp.read()
            except (UnicodeDecodeError, LookupError):
                # UnicodeDecodeError: the file is not in this encoding.
                # LookupError: the codec does not exist on this platform.
                # Either way, fall through to the next candidate instead of
                # aborting the whole read.
                continue
            print(f"成功使用 {encoding} 编码打开文件")
            return jieba.lcut(text)

        # Every candidate encoding failed.
        print("读取文件时出错: 无法使用已知编码打开文件")
        return []
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # treat an empty list as "nothing to analyze".
        print(f"读取文件时出错: {e}")
        return []

# Frequent words that are not character names and should not be counted.
# A frozenset keeps the per-token membership test O(1).
_EXCLUDE_WORDS = frozenset([
    '好', '其他', '那里', '怎么', '我们', '妖精', '和尚', '两个', '甚么', '不是',
    '国王', '土地', '徒弟', '原来', '如何', '这个', '闻言', '不曾', '今日', '不敢',
    '陛下', '人马', '不知', '汉中', '一人', '众将', '只见', '后主', '上马', '大叫',
    '此人', '一个', '菩萨', '却说', '师父', '一声', '不得', '出来', '不见', '如此',
    '自己', '说道', '知道', '起来', '回来', '过来', '不要', '一日', '一面', '只得',
    '不能', '那个', '东西', '一把', '所以', '不过', '一些', '什么', '没有', '就是',
    '可以', '这样', '那些', '一起', '一下', '一只', '一件', '一座', '一回', '一路',
])

# Canonical character name -> every form of address that should count for it.
_CHARACTER_ALIASES = {
    "唐僧": ['江流', '唐三藏', '大唐御弟', '御弟哥哥', '唐玄奘', '唐长老', '圣僧', '大唐和尚', '唐僧', '长老', '三藏'],
    "孙悟空": ['弼马温', '孙行者', '齐天大圣', '斗战胜佛', '孙猴子', '美猴王', "孙悟空", "行者", '大圣', '悟空', '老孙'],
    "猪八戒": ['悟能', '猪刚鬣', '天蓬元帅', '净坛使者', '八戒', '二师兄', '老猪', '呆子'],
    "沙僧": ['沙和尚', '沙悟净', '沙僧'],
}

# Reverse lookup (alias -> canonical name), built once at import time.
_ALIAS_TO_CHARACTER = {
    alias: character
    for character, aliases in _CHARACTER_ALIASES.items()
    for alias in aliases
}


def count_chinese_words(word_list):
    """
    Count word frequencies, merging the different names of a character.

    Single-character tokens and tokens listed in the exclusion set are
    skipped. The exclusion check runs on the raw token, before alias
    merging. Remaining tokens that are a known alias are counted under the
    character's canonical name; all other tokens are counted as-is.

    Args:
        word_list: Iterable of string tokens (the segmented text).

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each word to
        its number of occurrences, with keys in first-occurrence order.
    """
    return Counter(
        _ALIAS_TO_CHARACTER.get(word, word)
        for word in word_list
        if len(word) != 1 and word not in _EXCLUDE_WORDS
    )

def sort_word_counts(word_counts, top_n=None):
    """
    Sort word counts by frequency, highest first.

    The sort is stable, so words with equal counts keep the order in which
    they appear in ``word_counts``.

    Args:
        word_counts: Mapping of word -> occurrence count.
        top_n: Maximum number of entries to return. ``None`` (the default)
            returns every entry; ``0`` returns an empty list.

    Returns:
        A list of ``(word, count)`` tuples in descending count order.
    """
    sorted_items = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

    # Compare against None explicitly: a plain truthiness test would treat
    # top_n=0 as "no limit" and return the full list.
    if top_n is not None:
        return sorted_items[:top_n]
    return sorted_items

def print_word_frequency(sorted_items, top_n=30):
    """
    Print a ranked word-frequency table to standard output.

    A centered title line, a column header and a separator are printed
    first, followed by one row per entry with its 1-based rank.

    Args:
        sorted_items: Sequence of ``(word, count)`` pairs, already sorted.
        top_n: Number of leading entries to print.
    """
    title = " 西游记人物词频统计 "
    print(f"\n{title:-^50}")
    print(f"{'排名':<10}{'词语':<10}{'出现次数':<10}")
    print("-" * 30)

    rank = 0
    for word, count in sorted_items[:top_n]:
        rank += 1
        print(f"{rank:<10}{word:<10}{count:<10}")

def save_results(sorted_items, output_file, top_n=100):
    """
    Write the ranked word counts to a UTF-8 CSV file.

    The file has a header row followed by one ``rank,word,count`` row per
    entry. Rows are written with ``csv.writer`` so that a word containing a
    comma, quote or line break is quoted instead of corrupting the layout.

    Args:
        sorted_items: Sequence of ``(word, count)`` pairs, already sorted.
        output_file: Path of the CSV file to create or overwrite.
        top_n: Number of leading entries to save.
    """
    try:
        # newline='' lets the csv module control line endings itself, as the
        # csv documentation requires for files passed to csv.writer.
        with open(output_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(["排名", "词语", "出现次数"])
            writer.writerows(
                (rank, word, count)
                for rank, (word, count) in enumerate(sorted_items[:top_n], 1)
            )
        print(f"\n结果已保存到 {output_file}")
    except Exception as e:
        # Deliberately best-effort: a failed save is reported but must not
        # abort the rest of the analysis.
        print(f"保存结果时出错: {e}")

def main(file_path="西游记全集（吴承恩）.txt", output_file="西游记词频统计结果.csv"):
    """
    Run the full analysis: read, segment, count, rank, print and save.

    Args:
        file_path: Path of the novel text to analyze. Defaults to the file
            name the script was originally hard-coded to use.
        output_file: Path of the CSV file the ranking is saved to.

    Returns:
        None. Progress and results are printed. The function returns early
        if the input file does not exist or yields no tokens.
    """
    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start_time = time.perf_counter()

    # Bail out early with a clear message if the input is missing.
    if not os.path.exists(file_path):
        print(f"错误: 文件 '{file_path}' 不存在!")
        return

    print(f"开始分析 '{file_path}'...")

    # Read and segment; an empty list means reading failed (already reported)
    # or the file had no content.
    word_list = get_chinese_text(file_path)
    if not word_list:
        return

    print(f"分词完成，共有 {len(word_list)} 个词语")

    # Count word frequencies.
    word_counts = count_chinese_words(word_list)
    print(f"词频统计完成，共有 {len(word_counts)} 个不同词语")

    # Rank by frequency.
    sorted_items = sort_word_counts(word_counts)

    # Print the top of the ranking.
    print_word_frequency(sorted_items)

    # Save the ranking to CSV.
    save_results(sorted_items, output_file)

    # Report the four protagonists explicitly. The counts mapping is looked
    # up directly rather than scanning the sorted list for each name.
    main_characters = ["唐僧", "孙悟空", "猪八戒", "沙僧"]
    print("\n{0:-^50}".format(" 主要人物词频统计 "))
    for character in main_characters:
        count = word_counts.get(character, 0)
        print(f"{character}: {count} 次")

    # Report elapsed time.
    end_time = time.perf_counter()
    print(f"\n分析完成，耗时 {end_time - start_time:.2f} 秒")


if __name__ == "__main__":
    main()

开始分析 '西游记全集（吴承恩）.txt'...
成功使用 ANSI 编码打开文件
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ROMANT~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.584 seconds.
Prefix dict has been built successfully.
分词完成，共有 501780 个词语
词频统计完成，共有 45696 个不同词语

------------------- 西游记人物词频统计 --------------------
排名        词语        出现次数
------------------------------
1         孙悟空       6259
2         唐僧        2865
3         猪八戒       2322
4         沙僧        829
5         大王        379
6         正是        353
7         只是        343       
8         那怪        340
9         真个        295
10        小妖        285
11        这里        283
12        兄弟        269
13        宝贝        266
14        取经        262
15        如今        258
16        三个        258
17        这般        248
18        铁棒        231
19        认得        222
20        妖怪        215
21        师徒        214
22        果然        212
23        老者        212
24        上前        210
25        性命        204
26        有些        203
27        孙大圣       201
28        如来        201
29        你们        196
30        太子        191

结果已保存到 西游记词频统计结果.csv

-------------------- 主要人物词频统计 --------------------
唐僧: 2865 次
孙悟空: 6259 次
猪八戒: 2322 次
沙僧: 829 次

分析完成，耗时 4.17 秒