本文首先使用jieba分词提取出红楼梦人物及出现次数,然后使用pyecharts进行可视化。文本分词并非重点关注,主要是做pyecharts的可视化练习。
import pandas as pd
import jieba
import re
from pyecharts.globals import CurrentConfig, OnlineHostType
# Serve pyecharts JS assets from the notebook host so charts render inside
# Jupyter without fetching from a CDN (presumably for offline use — the intent
# follows from the OnlineHostType.NOTEBOOK_HOST name).
CurrentConfig.ONLINE_HOST = OnlineHostType.NOTEBOOK_HOST
分章节
def split_chapters(content, path):
    """Split the full novel text into chapters and save each one to a file.

    Parameters
    ----------
    content : str
        The whole text; chapters are separated by five blank lines
        (six consecutive ``'\\n'``).
    path : str
        Path prefix for output files; chapter ``i`` is written to
        ``path + str(i) + '.txt'`` with the title on the first line and the
        whitespace-stripped body after it.

    Returns
    -------
    list[str]
        The chapter titles, in order.

    Raises
    ------
    AttributeError
        If a chapter contains no ``回…\\n`` title line (``re.search`` returns
        ``None``), matching the original behaviour.
    """
    # Compile once instead of re-scanning the pattern on every chapter.
    title_re = re.compile(r'回(.*?)\n')
    chapter_titles = []
    for i, episode in enumerate(content.split('\n\n\n\n\n\n')):
        # Drop the full-width indentation that starts each paragraph.
        e = episode.replace('\n\u3000\u3000', '')
        # Title line looks like "…回<title>\n"; normalise full-width spaces to
        # ASCII spaces and strip stray characters ('\ue7a3' appears in one
        # chapter's title in the source text).
        title = (title_re.search(e).group()
                 .replace('回', '')
                 .replace('\u3000', ' ')
                 .replace('\n', '')
                 .replace('\ue7a3', ' ')
                 .strip())
        chapter_titles.append(title)
        # Body follows the title after one blank line; remove all whitespace.
        chapter_content = re.sub(r'\s', '', e.split('\n\n')[1])
        # `with` closes the file automatically — the original's explicit
        # f.close() inside the with-block was redundant and is removed.
        with open(path + str(i) + '.txt', 'w', encoding='utf-8') as f:
            f.write(title + '\n')
            f.write(chapter_content)
    return chapter_titles
分词,返回分词列表
def content_cut(content):
    """Tokenise *content* with jieba in precise mode and return the tokens.

    Parameters
    ----------
    content : str
        Text to segment.

    Returns
    -------
    list[str]
        The tokens, materialised from jieba's generator.
    """
    # list() drains the generator directly; no intermediate comprehension needed.
    return list(jieba.cut(content, cut_all=False))
停用词
def get_stop_words(num, path):
    """Build a stop-word list for the corpus.

    Tokens that appear in *every* chapter (the intersection across all
    ``num`` chapter files under ``path``) are treated as stop words, then the
    general stop-word file and the novel-specific one are merged in.

    Parameters
    ----------
    num : int
        Number of chapter files to read (``path + '0.txt'`` … ``path + str(num-1) + '.txt'``).
    path : str
        Path prefix of the chapter files written by ``split_chapters``.

    Returns
    -------
    list[str]
        Deduplicated stop words (order unspecified, as in the original).
    """
    stop_words = set()
    for i in range(num):
        # Line 0 of each chapter file is the title; line 1 is the body.
        # `with` closes the file — the original's explicit f.close() inside
        # the with-block was redundant and is removed.
        with open(path + str(i) + '.txt', 'r', encoding='utf-8') as f:
            words = set(content_cut(f.readlines()[1]))
        # Running intersection: keep only tokens present in every chapter.
        # (The original's initial `stop_words = set()` was dead — the i == 0
        # branch always overwrote it.)
        stop_words = words if i == 0 else stop_words & words
    stop_words = list(stop_words)
    # General-purpose stop words (file is GBK-encoded).
    with open('stopwords.txt', 'r', encoding='gbk') as f1:
        stop_words.extend(line.strip() for line in f1)
    # Custom stop words specific to the novel.
    with open('红楼梦stopwords.txt', 'r', encoding='utf-8') as f2:
        stop_words.extend(line.strip() for line in f2)
    return list(set(stop_words))
统一人物名称
有些是根据第一次分词结果添加的
# Canonical-name map: key = a character's canonical name, value = the list of
# aliases/nicknames to fold into it (some entries were added after inspecting
# a first tokenisation pass, per the author's note).
maps = {
    '贾宝玉': ['宝二爷', '宝兄弟', '怡红公子', '绛洞花主', '二哥哥', '天魔星', '宝玉'],
    '贾母': ['史太君', '老祖宗', '老太太', '贾母笑'],
    '林黛玉': ['颦颦', '颦儿', '林姑娘', '林妹妹', '潇湘妃子', '黛玉'],
    '王熙凤': ['凤姐', '琏二奶奶', '凤辣子', '凤哥儿', '凤丫头', '凤姐儿', '熙凤'],
    '薛宝钗': ['蘅芜君', '宝姐姐', '宝丫头', '宝姑娘', '宝钗', '宝钗笑', '宝钗道'],
    '袭人': ['袭人道'],
    '史湘云': ['湘云'],
    '薛姨妈': ['姨妈'],
    '贾雨村': ['雨村'],
    '小红': ['红玉', '小红道', '红玉道', '红玉笑'],
    '紫鹃': ['鹦哥'],
    '贾元春': ['贵妃', '元妃', '元春'],
    '贾迎春': ['二妹妹', '迎春'],
    '贾探春': ['三妹妹', '探春'],
    '贾惜春': ['四妹妹', '惜春'],
    '秦可卿': ['秦氏', '可儿', '可卿', '秦氏笑'],
    '李纨': ['李宫裁'],
    '尤氏': ['尤氏笑'],
    '贾蓉': ['贾蓉笑', '贾蓉道'],
    '香菱': ['英莲'],
    '贾政': ['贾政道', '贾政笑'],
    '空空道人': ['那道', '道士', '道人', '那道士'],
    '癞头和尚': ['僧道', '那僧道', '那僧笑'],
    '冷子兴': ['子兴道', '子兴', '子兴笑'],
    '林如海': ['如海', '如海笑']
}

# Reverse lookup built once at module level: alias -> canonical name.
# Hoisting this out of clear_words turns the per-token scan of every maps
# entry (accidental O(tokens x aliases)) into an O(1) dict lookup.
_alias_to_name = {alias: name
                  for name, aliases in maps.items()
                  for alias in aliases}


def clear_words(segments, stop_words):
    """Normalise character aliases, then drop stop words and 1-char tokens.

    Parameters
    ----------
    segments : list[str]
        Token list from ``content_cut``. Unlike the original, the input list
        is NOT mutated in place (the old version overwrote ``segments[i]``,
        a surprising side effect on the caller's list).
    stop_words : collection[str]
        Tokens to discard.

    Returns
    -------
    list[str]
        Canonicalised tokens, minus stop words and single-character tokens.
    """
    normalized = [_alias_to_name.get(seg, seg) for seg in segments]
    # Filter AFTER normalising so a canonical name can still be stopped.
    return [seg for seg in normalized if seg not in stop_words and len(seg) > 1]
每回出现人物及次数,生成df
character_list = ['贾宝玉','贾母','林黛玉','薛宝钗', '史湘云', '王熙凤', '巧姐', '王夫人',
'邢夫人', '薛姨妈', '尤氏', '贾元春', '贾迎春', '贾探春','贾惜春','秦可卿',
'尤二姐', '尤三姐', '李纨','妙玉','紫鹃','袭人','晴雯','麝月','茗烟','刑岫烟','平儿',
'鸳鸯','玉钏儿','金钏儿','赵姨娘','周瑞家的','林之孝家的','小红','坠儿','芳官','龄官','藕官','翠缕',
'夏金桂','宝蟾','香菱','甄士隐','甄宝玉','冷子兴','智能儿','娇杏','贾琏','贾政',
'贾雨村','贾珍','薛蟠','薛蟠','薛蝌','贾蓉','贾蔷','贾瑞','贾芸','贾敬&