任务
2023/3/20
第一外国语课程汇报。
本地路径:E:\Doctor\study\英语\词频统计
词汇任务是统计引言中最常用的词汇,然后绘制频数图、词云:
- 收集本领域英文文献
- 从文献中找出引言
- 词频统计
- 频率图、词云
算法
输入
输出
Python源代码
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 19 18:58:09 2023
@author: LJB
"""
# 根据英文文献PDF读取引言部分,并分词,去除标点、常用词后,绘制频率图、词云
import os
import wordcloud
# 将papers论文文件夹所有论文名读取到列表paper_names中
path = os.path.abspath('.')
paper_path = path + '/papers'
paper_names = os.listdir(paper_path)
# 从论文中读取引言
intro_text = ''
for paper_name in paper_names:
with open(paper_path + '/' + paper_name, 'r', encoding = 'utf-8') as fin:
intro_text += fin.read()
#lower()把全部大写变小写,split()分割字符串
intro_list = intro_text.lower().split()
# 从引言中找出频数最高的词汇
from collections import Counter
vocabulary = Counter(intro_list)
from nltk.corpus import stopwords #自然语言处理
stop_words = stopwords.words('English') #取出英文停用词
stop_words += ['al', 'al.,2020).', 'al.,', 'et', 'also']
for sw in stop_words:
del vocabulary[sw] #删除里面的停用词
most_common_words = []
for element in vocabulary.most_common(500):
most_common_words.append(element[0])
# 提取副词和比较级
from nltk import pos_tag
most_common_words_rb = [word for word, tag in pos_tag(most_common_words) if tag in ["RB","JJS"]]
print(most_common_words_rb)
words = [word for word, tag in pos_tag(intro_list) if tag in ["RB","JJS"]]
w = wordcloud.WordCloud(font_path = "msyh.ttc",width = 1000, height = 700, background_color = "white", stopwords = stop_words)
w.generate(" ".join(words))
w.to_file("most_common_words.png")