Python批量下载sci-hub文献

最新推荐文章于 2024-07-22 10:35:36 发布

丘比特爱睡觉

最新推荐文章于 2024-07-22 10:35:36 发布

阅读量2.1k

点赞数 3

文章标签： python

本文链接：https://blog.csdn.net/weixin_45744832/article/details/122478919

版权

# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os,re

# Directory that receives the downloaded PDFs.  It is built from os.sep so it
# is ".\Downloaded\" on Windows and "./Downloaded/" elsewhere.  The trailing
# separator is kept because Download() builds file names by concatenating
# directly onto this string.
path = os.path.join(".", "Downloaded") + os.sep
os.makedirs(path, exist_ok=True)

# Start every run with a fresh error log; a missing log is not an error.
try:
    os.remove("error.txt")
except FileNotFoundError:
    pass

# doi.txt holds one reference per line (exported from EndNote in APA 6th
# style; manually adjust each entry so that the authors after the first are
# replaced with "et al").  Terminate every entry with a newline, including the
# last one.  The handle is closed at the end of the script, after the
# processing loop.
f = open("doi.txt", "r", encoding="utf-8")

# Browser-like User-Agent sent with every request; added 2021-06-07 to avoid
# HTTP 403 responses.
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}

# Running count of failed entries; used to number the records in error.txt.
err_num = 0

def Download(url, author_date, timeout=30):
    """Download the PDF embedded in a Sci-Hub article page.

    The page at *url* is fetched and parsed, the ``src`` of its first
    ``<iframe>`` is taken as the PDF location, and the PDF is written to
    ``path + <author_date> + "pdf"``.  Nothing is fetched when that file
    already exists.  ``path`` and ``head`` are module-level globals (output
    directory and request headers).

    Args:
        url: Address of the article page to scrape.
        author_date: Citation prefix used as the file name stem.  The caller
            passes text that already ends with ".", so only "pdf" is appended.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: A request failed, timed out, or returned
            an HTTP error status.
        ValueError: The page contains no ``<iframe>`` with a ``src``.
        OSError: The PDF could not be written to disk.
    """
    from urllib.parse import urljoin

    # Characters that are illegal in Windows file names (and "/" everywhere)
    # are replaced so that an unusual citation cannot break the output path.
    safe_name = author_date.translate(str.maketrans('\\/:*?"<>|', "_" * 9))
    file = path + safe_name + "pdf"

    if os.path.exists(file):
        print(author_date + "pdf already exists!")
        return

    # Without a timeout a single stalled connection would block the batch.
    r = requests.get(url, headers=head, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")

    frame = soup.iframe
    src = frame.attrs.get("src") if frame is not None else None
    if not src:
        raise ValueError("no PDF iframe found on page: " + url)

    # The iframe src may be protocol-relative ("//host/file.pdf") or relative;
    # resolve it against the page's final URL so requests gets a full URL.
    download_url = urljoin(r.url, src)

    download_r = requests.get(download_url, headers=head, timeout=timeout)
    download_r.raise_for_status()
    with open(file, "wb") as temp:
        temp.write(download_r.content)
    print(author_date + "pdf downloaded!")

def _log_failure(entry, detail=""):
    """Append a numbered failure record for *entry* (plus *detail*) to error.txt."""
    global err_num
    err_num = err_num + 1
    with open("error.txt", "a+", encoding="utf-8") as error:
        error.write(str(err_num) + "." + entry + " occurs error!\n" + detail)


# Process every reference in doi.txt: derive the file name stem and the DOI
# from the line, then try to download the PDF.  Failures are logged and the
# loop carries on with the next entry.
try:
    for line in f:
        # Strip only the line terminator, so a final line without a trailing
        # newline does not lose its last character.
        line = line.rstrip("\r\n")
        if not line.strip():
            continue  # blank lines are not references

        # Everything up to the first ")." (normally "Author et al. (Year).")
        # becomes the file name stem.
        author_date = line.split(').')[0] + ")."

        # The DOI is the text following the "doi:" marker.
        parts = line.split('doi:')
        doi = parts[1].strip() if len(parts) > 1 else ""
        if not doi:
            print(line + "\n" + "doi not found,failed to download!!!")
            _log_failure(line)
            continue

        url = "https://www.sci-hub.ren/doi:" + doi + "#"
        try:
            Download(url, author_date)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the batch.
            _log_failure(line, " --download_url may be:\n" + url + "\n")
            print(line + "\n" + "Failed to download!!!")
finally:
    # Close the input file even if the loop is interrupted.
    f.close()