1. 读取各种txt文件
# Read the full text of "Dream of the Red Mansion".
# A context manager is used so the file handle is closed deterministically.
with open('Dream_of_the_Red_Mansion.txt', 'r', encoding='utf-8') as fp:
    txt = fp.read()
# Read the character names of the novel, one entry per line.
with open('Red_Mansion_Dictionary.txt', 'r', encoding='utf-8') as fp:
    names = fp.read().split('\n')
# Add the character names to jieba's user dictionary.
jieba.load_userdict("Red_Mansion_Dictionary.txt")
# Read the stop-word lists and concatenate them into a single list.
path_list = ["cn_stopwords.txt", "baidu_stopwords.txt", "hit_stopwords.txt", "scu_stopwords.txt"]
filter_words = []
for file in path_list:
    # The `with` statement closes the file on exit; no explicit close() is needed.
    with open(f"stopwords-master/{file}", "r", encoding="utf-8") as fp:
        filter_words.extend(fp.read().split("\n"))
2. 分词及初步筛选
# Tokenize the full text with jieba.
words = jieba.lcut(txt)
# First-pass filter: drop every token that is a stop word.
# The stop words are copied into a set once so that each membership test is a
# hash lookup instead of a linear scan of the whole stop-word list.
stop_words = set(filter_words)
words = [word for word in words if word not in stop_words]
3. 对人物的多个称谓进行合并处理1
# Count how often each character name occurs and store the result in `counts`.
#
# Each entry below is a '-'-separated group of tokens that are counted
# together; the first token of a group is the key the whole group is counted
# under.
alias_groups = (
    '贾母-老太太',
    '贾珍-尤氏',
    '贾蓉-秦可卿',
    '贾赦-邢夫人',
    '贾政-王夫人',
    '袭人-蕊珠',
    '贾琏-王熙凤',
    '紫鹃-鹦哥',
    '翠缕-缕儿',
    '香菱-甄英莲',
    '豆官-豆童',
    '薛蝌-邢岫烟',
    '薛蟠-夏金桂',
    '贾宝玉-宝玉',
    '林黛玉-林姑娘-黛玉',
)
# Build the token -> canonical-name table once, outside the counting loop.
canonical_name = {}
for group in alias_groups:
    members = group.split('-')
    for member in members:
        canonical_name[member] = members[0]
# Set copy of the name list so the per-token membership test is a hash lookup.
known_names = set(names)

counts = {}
for word in words:
    # Drop meaningless single-character tokens.
    if len(word) == 1:
        continue
    # Map an alias to its canonical name; other tokens are left unchanged.
    word = canonical_name.get(word, word)
    # Only tokens that appear in the character-name list are counted.
    if word not in known_names:
        continue
    counts[word] = counts.get(word, 0) + 1
4. 对结果排序,输出出场频次前20的人名及次数
# Sort the (name, count) pairs by count, highest first.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)
# Print the 20 most frequent names and their counts in aligned columns:
# the name left-aligned in 10 columns, the count right-aligned in 5.
# Slicing (rather than indexing 0..19) avoids an IndexError when fewer than
# 20 distinct names were counted.
for word, count in items[:20]:
    print("{0:<10}{1:>5}".format(word, count))
完结,如果对你有帮助,欢迎点赞、收藏、关注!