1. Requirements
A pretty girl heard that I had once helped another pretty girl download a pile of papers, and insisted that I scrape some SCI articles for her too.
Well, I have no resistance whatsoever to pretty girls, so in the end I agreed.
The goal is simple: avoid downloading the papers by hand, since there are a lot of them and the manual steps are tedious, which makes a crawler very convenient. Last time I only scraped the article metadata, such as title, authors, abstract, publication date, citation information, and DOI. This time there is one small change: the articles themselves need to be downloaded as well, hence this post.
2. Website Information
3. Download Method
One thing to note: this site has no button for downloading the article directly. After locating an article, you take its title (or, more commonly for SCI papers, its DOI) to a dedicated SCI download site.
Here are several sites I have collected that specialize in downloading SCI articles:
- https://scholar.chongbuluo.com/
- http://sci-hub.ac.cn/
- https://www.sci-hub.ren/
- https://sci-hub.se/
- https://www.ablesci.com/
- https://sci-hub.st/
- https://sci-hub.do/
One of them is bound to work for you (a quick availability probe is sketched below).
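Mirror availability changes over time, so it can help to probe the list before settling on one. Below is a minimal sketch, assuming the requests library; the subset of mirrors and the 5-second timeout are illustrative choices, not part of the original script:

import requests

MIRRORS = [
    "https://sci-hub.se/",
    "https://sci-hub.st/",
    "https://sci-hub.do/",
]

def first_reachable_mirror(mirrors=MIRRORS, timeout=5):
    # Return the first mirror that answers with HTTP 200, or None if none respond.
    for base in mirrors:
        try:
            if requests.get(base, timeout=timeout).status_code == 200:
                return base
        except requests.RequestException:
            continue
    return None

if __name__ == '__main__':
    print(first_reachable_mirror())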
Combining one of these Sci-Hub addresses with a DOI gives a URL that points straight to the article to download:
Searching for the title on the Sci-Hub site takes you to a page that provides the article's actual download link:
As you can see, this URL is made up of two parts:
- https://sci-hub.do/
- 10.1080/00224499.2016.1143441
The first part is the site address; the second is the DOI. So we parse the result pages returned by https://pubmed.ncbi.nlm.nih.gov/
to extract each article's DOI, concatenate that DOI with the site address https://sci-hub.do/,
and can then download the article directly.
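In code this concatenation is plain string joining. A minimal sketch using the example DOI above (the variable names are only illustrative):

SCI_HUB_BASE = "https://sci-hub.do/"
doi = "10.1080/00224499.2016.1143441"
download_page_url = SCI_HUB_BASE + doi
# https://sci-hub.do/10.1080/00224499.2016.1143441
print(download_page_url)

Note that this URL is only the landing page for the article; the script below still has to parse the real PDF address out of that page before downloading.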
4. Code Implementation
- The code creates a directory for each search keyword to store the downloaded PDF articles, together with an Excel file holding each article's key information, so the matching PDF can be found quickly; a minimal sketch of this layout follows, and the full script after it does the same thing in its __main__ block.
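Here is a minimal sketch of that layout, assuming the same xlwt dependency as the script; the keyword value and the use of os.makedirs are illustrative, while the full code below does the equivalent with os.mkdir/os.chdir in its __main__ block:

import os
import xlwt

keyword = 'Isotope tracing tumor'      # the search keyword doubles as the folder name
os.makedirs(keyword, exist_ok=True)    # one directory per keyword
os.chdir(keyword)                      # downloaded PDFs land in this directory

workbook = xlwt.Workbook()
sheet = workbook.add_sheet('PaperInfo', True)  # one row per article with its key information
workbook.save(keyword + '.xls')                # the Excel file sits next to the PDFs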
# Author : 叨陪鲤
# Date : 2021/8/14
# Position : Beijing
import os
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from urllib import request
from urllib import error
import requests
import xlwt
import re
import time
import urllib
TotalNum=0
class Article(object):
title = ""
link = ""
doi = ""
pdf = ""
authors = ""
magz = ""
time = ""
cite = ""
snip = ""
    def __init__(self):
        self.title = "New Paper"
def html_request(url):
if url is None:
return
print("download html is :{0}".format(url))
    # If the URL contains non-ASCII characters, it must be URL-encoded first
    # Mimic a browser by sending a random User-Agent header
    # headers = {'User-Agent': str(UserAgent().random)}
    headers = {'User-Agent': str(UserAgent(path="D:/BaiduNetdiskDownload/fakeuseragent.json/ua.json").random)}
req = request.Request(url, headers=headers)
try:
html = request.urlopen(req).read().decode('utf-8')
except error.URLError as e:
if hasattr(e, "code"):
print(e.code)
if hasattr(e, "reason"):
print(e.reason)
return None
# print(html)
return html
def download_file(url, fileName):
# "https://sci-hub.se/10.1016/j.cmet.2016.08.009."
    # The URL passed in is the download page (the DOI link); the real article address is
    # embedded inside that DOI page, so:
    # 1. fetch the download page first
    # 2. then parse the real download address out of that page
# "https://twin.sci-hub.se/5944/f6efbf1a018850f488666efbf25ee81a/ducker2016.pdf"
if url == "" or url is None:
return None
    doi_html = html_request(url)
    # Parse doi_html for the embedded PDF link
pattern_article = '<embed type="application/pdf" src="(.+?)" id = "pdf"></embed>'
articles = re.compile(pattern_article, re.S).findall(doi_html.replace('\n', ''))
# print(articles)
    for article in articles:  # usually there is only one article link, so break after handling the first
        if not article.startswith("https:"):
            article = "https:" + article
print("========>"+article)
r = requests.get(article)
with open(fileName, "wb") as code:
code.write(r.content)
break
def save_xls(sheet,paper):
    # Store the data into the Excel sheet, one column per field
global TotalNum
sheet.write(TotalNum, 0, TotalNum)
sheet.write(TotalNum, 1, paper.title)
sheet.write(TotalNum, 2, paper.link)
sheet.write(TotalNum, 3, paper.doi)
sheet.write(TotalNum, 4, paper.pdf)
sheet.write(TotalNum, 5, paper.authors)
sheet.write(TotalNum, 6, paper.magz)
sheet.write(TotalNum, 7, paper.time)
    sheet.write(TotalNum, 8, paper.cite)
    sheet.write(TotalNum, 9, paper.snip)
TotalNum += 1
# Originally used only for debugging the page parsing
def html_parser0(url, html):
    if url is None or html is None:
        return
    # Use a regex to match every article entry in the result list
pattern_article = '<article class="full-docsum" data-rel-pos=(.+?)</article>'
articles = re.compile(pattern_article, re.S).findall(html.replace('\n', ''))
    # Walk through each article's information
for article in articles:
soup = BeautifulSoup(article, 'html.parser')
title = soup.find('a', attrs={'class': 'docsum-title'})
# print("[Title]:{0}".format(title.text.replace(' ', '')))
# print("[Link]:{0}{1}".format("https://pubmed.ncbi.nlm.nih.gov", title.attrs['href']))
authors = soup.find('span', attrs={'class': 'docsum-authors full-authors'})
# print("[Author]:{0}".format(authors.text))
citationInfos = soup.find('span', attrs={'class': 'docsum-journal-citation full-journal-citation'})
Mtd = "{0}".format(citationInfos.text).split('.')
# print("[MAGZ]:{0}".format(Mtd[0]))
# print("[Time]:{0}".format(Mtd[1].split(';')[0]))
# print("[DOI]:{0}".format(Mtd[2].split(':')[1]))
citation = soup.find('span', attrs={'class': 'citation-part'})
# print("[Cite]:{0}".format(citation.text.split(':')[1]))
citation = soup.find('div', attrs={'class': 'full-view-snippet'})
# print("[Snip]:{0}\n".format(citation.text).replace(' ', ''))
def html_parser(sheet, html):
    if html is None:
        return
    # Use a regex to match every article entry in the result list
pattern_article = '<article class="full-docsum" data-rel-pos=(.+?)</article>'
articles = re.compile(pattern_article, re.S).findall(html.replace('\n', ''))
print(len(articles))
    # Walk through each article's information
    for article in articles:
        paper = Article()  # one object per article to hold its information
soup = BeautifulSoup(article, 'html.parser')
        # Extract the individual pieces of key information
title = soup.find('a', attrs={'class': 'docsum-title'})
authors = soup.find('span', attrs={'class': 'docsum-authors full-authors'})
citationInfos = soup.find('span', attrs={'class': 'docsum-journal-citation full-journal-citation'})
Mtd = "{0}".format(citationInfos.text).split("doi: ")
cite = soup.find('span', attrs={'class': 'citation-part'})
        snip = soup.find('div', attrs={'class': 'full-view-snippet'})  # abstract snippet
# print(Mtd)
        # Store the information on the paper object
        paper.title = "{0}".format(title.text.strip())
paper.link = "{0}{1}".format("https://pubmed.ncbi.nlm.nih.gov",title.attrs['href'])
paper.authors = "{0}".format(authors.text)
paper.magz = "{0}".format(Mtd[0])
        if len(Mtd) > 1:
            paper_time = Mtd[1].split(';')
            if len(paper_time) > 0:
                paper.time = paper_time[0]
            doi = Mtd[1].split(" ")
            if len(doi) > 0:
                paper.doi = "https://sci-hub.se/" + doi[0]
else:
paper.doi = ""
        Cite = cite.text.replace(' ', '').split(':')
        if len(Cite) > 1:
            paper.cite = Cite[1]
        paper.snip = snip.text.strip()
global TotalNum
paper.pdf = "{0}".format("No" + str(TotalNum) + ".pdf")
        # Download the article to the local directory
        if not os.path.exists(paper.pdf):
download_file(paper.doi, paper.pdf)
TotalNum += 1
# save_xls(sheet, paper)
if __name__ == '__main__':
current_path = os.getcwd()
print(current_path)
target_path='Isotope tracing tumor'
    if not os.path.exists(target_path):
os.mkdir(target_path)
os.chdir(target_path)
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet(u'PaperInfo',True)
excel_name = ""+target_path+'.xls'
    column = ['No.', 'Title', 'Link', 'DOI', 'PDF Name', 'Authors', 'Journal', 'Published', 'Citations', 'Abstract']
for i in range(0, len(column)):
sheet1.write(TotalNum, i, column[i])
TotalNum+=1
page = 1
while page <= 50:
# url = "https://pubmed.ncbi.nlm.nih.gov/?term=Isotope%20tracing%20cancer&page="+str(page)
url = "https://pubmed.ncbi.nlm.nih.gov/?term=Isotope+tracing+tumor&page="+str(page)
html = html_request(url)
html_parser(sheet1, html)
# myxls.save(excel_name)
page += 1
# myxls.save(excel_name)
5. Scraping Results