爬取 PubMed 标题页面,然后统计词频、生成词云,可以更直观地让我们了解某个方向研究的热点或趋势是什么。上效果图:
需要输入的网址是在pubmed里搜索后生成的网址,可以是左侧进行各种过滤后的。
1 爬取标题:
import requests
from bs4 import BeautifulSoup
import re
import lxml

# Scrape the article titles from the first N pages of a PubMed search
# result and append them, one per line, to a text file.
start_url = input('输入网址:')
page = input('输入你想搜索前多少页:')

# Open the output file once, instead of re-opening it for every title.
with open('deep-sea-10-out.txt', 'a', encoding='utf-8') as out_file:
    for i in range(int(page)):
        # PubMed result pages are 1-based.
        url = start_url + "&page=" + str(i + 1)
        # Fetch the page; a browser-like UA avoids simplistic bot blocking.
        r = requests.get(url, headers={'user-agent': 'Mozilla/5.0'})
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'lxml')
        # Titles are <a class="docsum-title"> links. Match the class
        # attribute directly rather than substring-searching the
        # serialized tag and stripping markup with a regex — get_text()
        # already flattens nested <b> tags.
        for paper in soup.find_all('a', class_='docsum-title'):
            title = paper.get_text(" ", strip=True)
            out_file.write(title + '\n')
2 有的时候我们所搜索的研究,关键词有不一样的表述方法,需要多爬取几次,然后过滤一下相同的结果就好了:
# Merge several title files and drop duplicate articles (the original
# version only concatenated the lists without filtering duplicates,
# contrary to its stated purpose).
def merge_unique_titles(in_paths, out_path):
    """Merge the title lists in *in_paths* into *out_path*, de-duplicated.

    Prints the per-file line counts and the final merged count, mirroring
    the original script's output. First-appearance order is preserved.
    Returns the merged list of unique titles.
    """
    merged = []
    seen = set()
    for path in in_paths:
        with open(path, "r", encoding="utf-8") as in_file:
            titles = [line.strip() for line in in_file]
        print(len(titles))
        for title in titles:
            # set membership keeps this O(1) per title
            if title not in seen:
                seen.add(title)
                merged.append(title)
    print(len(merged))
    # Open the output once (appending, as before) rather than re-opening
    # it for every article.
    with open(out_path, 'a', encoding='utf-8') as out_file:
        out_file.writelines(title + '\n' for title in merged)
    return merged

if __name__ == "__main__":
    merge_unique_titles(["marine bacteria-out.txt", "marine bacterium-out.txt"],
                        'all-out.txt')
3 统计词频与生成词云:
# Common stopwords: after inspecting the frequency output, add any
# meaningless words here so they are excluded from the statistics.
excludes = {'a', 'an', 'and', 'of', 'from', 'for', 'in', 'by', 'the', 'with', 'to', 'on', 'among', 'us', 'study',
            'use', 'at', 'after', 'between', 'as'}
# Input file (produced by the merge step above).
in_file = 'all-out.txt'

# Punctuation stripped before tokenisation (same set as the original).
_PUNCTUATION = '|"$%&()*+,-./:;<=>?@[\\]^_‘{|}~'

def getText(path=None):
    """Read *path* (default: the module-level ``in_file``), lowercase it
    and replace punctuation with spaces.

    Returns one space-separated string containing all of the input text.
    """
    if path is None:
        path = in_file
    with open(path, "r", encoding="utf-8") as fh:
        txt = fh.read().lower()
    for ch in _PUNCTUATION:
        # Replace with a space (as the original comment intended) rather
        # than deleting, so hyphenated terms like "deep-sea" split into
        # two words instead of fusing into "deepsea".
        txt = txt.replace(ch, " ")
    return txt

if __name__ == "__main__":
    # Third-party; imported here so the helpers above stay importable
    # without the wordcloud package installed.
    import wordcloud

    inputTxt = getText()
    words = inputTxt.split()  # list of individual words
    # Filter stopwords with a comprehension: the original removed items
    # from the list being iterated, which skips consecutive stopwords.
    words2 = [w for w in words if w not in excludes]
    words3 = ' '.join(words2)
    w = wordcloud.WordCloud(font_path="msyh.ttc", width=1000, height=700,
                            background_color="white", max_words=50)  # max_words caps displayed words
    w.generate(words3)
    w.to_file("marine.jpg")  # output image file name
###########################################################
counts = {}
#将同义词合并并计数
for word in words:
if word == "cutinases" or word == "cutinase": #合并同义词,有多个则继续加elif
rword = "cutinase"
else:
rword = word
counts[rword] = counts.get(rword, 0) + 1
#去除常见词
for word in excludes:
if counts.get(word):
del counts[word]
else:
continue
#格式化输出
items = list(counts.items())
items.sort(key=lambda x:x[1], reverse=True)
for i in range(50): #输出前50位的
word, count = items[i]
print('{0:<20}{1:>5}'.format(word, count))