Counting Monster Appearances in 《西游记》 with Python (using jieba segmentation)

import jieba  # imported per the title, though this script matches names directly with str.count

# Monster names to count
monsters = ['国丈', '虎力大仙', '赛太岁', '鹿力大仙', '玉面公主', '白衣秀士',
            '九头虫', '黄风怪', '羊力大仙', '九灵元圣', '辟尘大王', '凌虚子',
            '黑鱼精', '如意真仙', '六耳猕猴', '美后', '黄狮精',
            '辟寒大王', '特处士', '老鼋', '寅将军', '辟暑大王', '灵感大王', '熊山君']

# Punctuation and whitespace to strip (both half- and full-width forms)
ignored = {' ', '　', '\t', '\n', '。', ',', '(', ')', '/', '-',
           '(', ')', ':', '□', '?', '!', '[', ']', ':',
           '《', '》', '、', ';', '“', '”', '……'}

# Read the novel and keep only the characters that are not ignored
temp_list = []
with open(r"西游记.txt", encoding="utf-8", mode='r') as f:
    for line in f:
        for each in line:
            if each not in ignored:
                temp_list.append(each.strip())
text = "".join(temp_list)

# Count each monster name in the cleaned text
counts = {}
for monster in monsters:
    counts[monster] = text.count(monster)

items = list(counts.items())                  # convert the key-value pairs to a list
items.sort(key=lambda x: x[1], reverse=True)  # sort by occurrence count, descending

# Print the 20 most frequent monsters
for i in range(20):
    word, count = items[i]
    print("{0:<5}{1:>5}".format(word, count))
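Note that the script above never actually segments the text: despite the jieba import named in the title, the monster names are matched directly with `str.count`. A minimal segmentation-based variant, assuming the same `西游记.txt` file and the `monsters` list defined above, could look like this sketch; registering each name with `jieba.add_word` keeps multi-character names from being split apart:

```python
import jieba
from collections import Counter

# Make sure jieba treats each monster name as a single token
for monster in monsters:
    jieba.add_word(monster)

with open("西游记.txt", encoding="utf-8") as f:
    words = jieba.lcut(f.read())

# Tally only the tokens that are monster names
monster_set = set(monsters)
monster_counts = Counter(w for w in words if w in monster_set)
for word, count in monster_counts.most_common(20):
    print("{0:<5}{1:>5}".format(word, count))
```

The two approaches can disagree slightly: `str.count` also matches a name embedded inside a longer phrase, while the segmented version counts only whole tokens.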

```python
import os
import re
import numpy as np
import pandas as pd
from pyecharts.charts import ThemeRiver
from pyecharts import options as opts

# Folder containing the chapter files
extract_path = "西游记"

# Collect the TXT files
txt_files = [f for f in os.listdir(extract_path) if f.endswith('.txt')]

# Read every file
all_texts = []
for txt_file in txt_files:
    with open(os.path.join(extract_path, txt_file), "r", encoding="utf-8", errors="ignore") as f:
        all_texts.append(f.read())

# Merge all chapter texts
full_text = "\n".join(all_texts)

# Split the text into chapters
chapters = re.split(r"第[一二三四五六七八九十百千]+回", full_text)[1:]

# Main characters to track
main_characters = ["唐僧", "悟空", "八戒", "沙僧", "妖怪", "神仙"]

# Per-chapter appearance counts
chapter_counts = {char: [] for char in main_characters}
chapter_labels = [f"第{i + 1}回" for i in range(len(chapters))]

# Count each character's occurrences in every chapter
for chapter in chapters:
    for char in main_characters:
        chapter_counts[char].append(chapter.count(char))

# Convert to a pandas DataFrame
df = pd.DataFrame(chapter_counts, index=chapter_labels)

# Prepare the theme-river data: (chapter, count, character) triples
x_data = chapter_labels  # chapters act as the time axis
y_data = []
for char in main_characters:
    for i, count in enumerate(df[char]):
        y_data.append([x_data[i], count, char])

# Build the theme river chart
themeriver = (
    ThemeRiver(init_opts=opts.InitOpts(width="1600px", height="800px"))  # canvas size
    .add(
        series_name=main_characters,  # series names
        data=y_data,                  # the data triples
        singleaxis_opts=opts.SingleAxisOpts(
            pos_top="50",      # distance from the top of the canvas
            pos_bottom="50",   # distance from the bottom
            type_="category",  # categorical axis
        ),
        tooltip_opts=opts.TooltipOpts(
            trigger="axis",           # trigger the tooltip on the axis
            axis_pointer_type="line"  # line-style axis pointer
        )
    )
    .set_global_opts(title_opts=opts.TitleOpts(title="《西游记》主要人物在各章节的出现频率(主题河流图)"))
)

# Render inline in Jupyter Notebook
themeriver.render_notebook()
```

Why does the chart fail to display in Jupyter Notebook even though the code runs without errors? Please solve the problem by referring to my word cloud code, shown below:

```python
# Import the required libraries
import requests
from bs4 import BeautifulSoup
from pyecharts.charts import WordCloud
from pyecharts import options as opts
import jieba
from collections import Counter
from IPython.display import display
from urllib.parse import urljoin

# Locate the data
base_url = "http://www.hbust.edu.cn"
response = requests.get(base_url)
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'html.parser')

colleges_section = None
for a in soup.find_all('a', href=True):
    if "教学科研单位" in a.text:
        colleges_section = a
        break

if colleges_section:
    colleges_url = urljoin(base_url, colleges_section['href'])
    response = requests.get(colleges_url)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')

college_links = []
for a in soup.find_all('a', href=True):
    href = urljoin(base_url, a['href'])
    if "学院" in a.text or "研究院" in a.text:
        college_links.append(href)

# Fetch the introduction text of each college
all_text = ""
failed_urls = []

for college_url in college_links:
    try:
        response = requests.get(college_url)
        response.encoding = response.apparent_encoding
        college_soup = BeautifulSoup(response.text, 'html.parser')
        introduction = college_soup.find('p')
        if introduction:
            all_text += introduction.get_text() + "\n"
    except Exception:
        failed_urls.append(college_url)

# Segment the text, dropping stop words and single characters
words = jieba.cut(all_text, cut_all=False)
stop_words = {"的", "是", "和", "在", "了", "有"}
word_list = [word for word in words if len(word) > 1 and word not in stop_words]

word_counts = Counter(word_list)

# Draw the word cloud
wordcloud = WordCloud()
wordcloud.add(
    series_name="湖北科技学院",
    data_pair=list(word_counts.items()),
    word_size_range=[20, 100],
    shape='diamond'
)
wordcloud.set_global_opts(title_opts=opts.TitleOpts(title="湖北科技学院二级学院简介"))

display(wordcloud.render_notebook())
```
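One difference between the two snippets stands out: the word cloud passes its rendered chart to IPython's `display`, while the theme river relies on `render_notebook()` being evaluated as the final expression of the cell. A minimal sketch applying the word cloud's pattern to the theme river chart, assuming a classic Jupyter Notebook kernel:

```python
from IPython.display import display

# Explicitly display the rendered chart instead of relying on it
# being the final expression in the cell
display(themeriver.render_notebook())
```

If the environment is JupyterLab rather than the classic notebook, pyecharts also needs `CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB` (from `pyecharts.globals`) to be set before the chart is created.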
To count the appearances of the main characters in 《西游记》, you can use Python with the jieba segmentation library. First load the novel's text into Python, then segment it with jieba.

Install jieba first:

```
pip install jieba
```

Next, import jieba and load the text:

```python
import jieba

# Load the text of 《西游记》
with open('xiyouji.txt', 'r', encoding='utf-8') as f:
    text = f.read()
```

Then segment the text. To remove stop words (punctuation, function words, and so on), define a custom stop-word list and filter the segmentation result against it. (jieba offers no `set_stop_words` call for plain segmentation; the `jieba.analyse.set_stop_words` API takes a file path and only affects keyword extraction, so here the filtering is done by hand.)

```python
# Custom stop-word list
stopwords = [',', '。', '、', '的', '了', '和', '是', '在', '他', '她', '他们', '我们']

# Segment the text and drop the stop words
words = [w for w in jieba.lcut(text) if w not in stopwords]
```

Next, count how often each word appears. Create a dictionary that maps each word to its count and walk through the segmentation result:

```python
# Dictionary of word -> occurrence count
character_count = {}

for word in words:
    if word in character_count:
        character_count[word] += 1
    else:
        character_count[word] = 1
```

Finally, sort by occurrence count and print the result:

```python
# Sort by occurrence count, descending
sorted_character_count = sorted(character_count.items(), key=lambda x: x[1], reverse=True)

# Print the ten most frequent words and their counts
for character, count in sorted_character_count[:10]:
    print(character, count)
```

That is how to use jieba segmentation to count the main characters' appearances in 《西游记》.
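The tally above covers every segmented word, so the top ten will include common nouns as well as names. A minimal sketch that restricts the count to an explicit character list, in the spirit of the `monsters` list in the first script (the `main_characters` names are borrowed from the theme-river question above, and the `words` list from the previous snippets is assumed):

```python
# Count only an explicit list of main characters
main_characters = ['唐僧', '悟空', '八戒', '沙僧']

character_count = {name: 0 for name in main_characters}
for word in words:
    if word in character_count:
        character_count[word] += 1

# Print the characters from most to least frequent
for name, count in sorted(character_count.items(), key=lambda x: x[1], reverse=True):
    print(name, count)
```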