Python批量下载sci-hub文献

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os,re

# Directory that receives the downloaded PDFs. The trailing separator is kept
# because Download() builds file names by plain string concatenation.
# os.path.join gives ".\Downloaded\" on Windows (same as before) and
# "./Downloaded/" elsewhere, instead of a literal backslash name on POSIX.
path = os.path.join(".", "Downloaded", "")
os.makedirs(path, exist_ok=True)

# Start every run with a fresh error log.
try:
    os.remove("error.txt")
except FileNotFoundError:
    pass

# doi.txt holds one reference per line (EndNote export in APA 6th style, with
# the authors after the first one replaced by "et al"). Terminate every entry,
# including the last one, with a newline.
f = open("doi.txt", "r", encoding="utf-8")

# Browser-like request headers; updated 2021-06-07 to avoid HTTP 403 errors.
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}

# Running count of references that could not be downloaded.
err_num = 0

def Download(url,author_date):
    """Download the PDF embedded in an article page and save it under ``path``.

    The page at *url* is fetched and parsed, the ``src`` attribute of its first
    ``<iframe>`` is taken as the PDF location, and the PDF is written to
    ``path + author_date + "pdf"``. No request is made when that file already
    exists.

    Args:
        url: Address of the article page to fetch.
        author_date: File-name stem. "pdf" is appended directly, so the stem
            is expected to end with "." (the caller passes text ending in ").").

    Raises:
        requests.RequestException: A request failed, timed out, or returned
            an HTTP error status.
        ValueError: The page contains no iframe with a ``src`` attribute.
        OSError: The target file could not be written.
    """
    # Function-scope import so this block does not rely on a file-level import.
    from urllib.parse import urljoin

    file = path + author_date + "pdf"
    if os.path.exists(file):
        print(author_date + "pdf already exists!")
        return

    # A timeout keeps one stalled server from hanging the whole batch.
    r = requests.get(url, headers=head, timeout=30)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")

    iframe = soup.find("iframe")
    src = iframe.get("src") if iframe is not None else None
    if not src:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("no PDF iframe found at " + url)

    # The src may be protocol-relative ("//host/file.pdf") or site-relative;
    # resolve it against the page URL so requests receives an absolute URL.
    download_url = urljoin(url, src)
    download_r = requests.get(download_url, headers=head, timeout=30)
    download_r.raise_for_status()

    with open(file, "wb") as temp:
        temp.write(download_r.content)
    print(author_date + "pdf downloaded!")

# Process the reference list: derive the file-name stem and the DOI from each
# line, download the article, and append every failure to error.txt.
try:
    for line in f:
        # Strip only the line terminator. Slicing off the last character would
        # corrupt a final line that has no trailing newline.
        line = line.rstrip("\r\n")
        if not line.strip():
            continue  # blank lines carry no reference

        # File-name stem: everything up to and including the first ")."
        author_date = line.split(').')[0] + ")."

        # Text after the "doi:" marker; surrounding whitespace must not end
        # up in the URL.
        doi = line.split('doi:')[1].strip() if 'doi:' in line else ""

        if doi:
            url = "https://www.sci-hub.ren/doi:" + doi + "#"
            try:
                Download(url, author_date)
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the
                # batch; any download failure is logged and the loop goes on.
                err_num = err_num + 1
                with open("error.txt", "a+", encoding="utf-8") as error:
                    error.write(str(err_num) + "." + line + " occurs error!\n --download_url may be:\n" + url + "\n")
                print(line + "\n" + "Failed to download!!!")
        else:
            print(line + "\n" + "doi not found,failed to download!!!")
            err_num = err_num + 1
            with open("error.txt", "a+", encoding="utf-8") as error:
                error.write(str(err_num) + "." + line + " occurs error!\n")
finally:
    # Close the DOI list even if the loop is interrupted.
    f.close()

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值