# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os,re
# Directory that receives the downloaded PDFs. Built with os.path.join so the
# separator matches the host OS (on Windows this is still ".\Downloaded\");
# the trailing separator is kept because file names are appended by plain
# string concatenation.
path = os.path.join(".", "Downloaded", "")
# exist_ok avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(path, exist_ok=True)

# Start every run with a fresh error log.
if os.path.exists("error.txt"):
    os.remove("error.txt")

# doi.txt holds one reference per line (exported from EndNote in APA 6th
# style, manually edited so that everything after the first author is
# replaced by "et al"). End every line, including the last, with a newline.
f = open("doi.txt", "r", encoding="utf-8")

# Browser-like User-Agent sent with every request (added 2021-06-07 to avoid
# HTTP 403 responses).
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
}

# Running count of references that could not be downloaded.
err_num = 0
def Download(url, author_date, timeout=60):
    """Download the PDF embedded in the page at *url* into the download folder.

    The page is fetched and parsed, the ``src`` of its first ``<iframe>`` is
    taken as the PDF location, and the PDF bytes are written to
    ``path + author_date + "pdf"``. Nothing is fetched when that file
    already exists.

    Args:
        url: Address of the landing page that embeds the PDF in an iframe.
        author_date: File-name stem; the caller passes text ending in "."
            so that appending "pdf" yields a ".pdf" name.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
        ValueError: If the page contains no iframe with a ``src`` attribute.
    """
    from urllib.parse import urljoin

    target = path + author_date + "pdf"
    if os.path.exists(target):
        print(author_date + "pdf already exists!")
        return

    page = requests.get(url, headers=head, timeout=timeout)
    page.raise_for_status()
    # Let requests guess the charset from the body before decoding to text.
    page.encoding = page.apparent_encoding
    soup = BeautifulSoup(page.text, "html.parser")

    frame = soup.find("iframe")
    if frame is None or not frame.get("src"):
        raise ValueError("no PDF iframe found at " + url)
    # The src may be protocol-relative ("//host/x.pdf") or relative to the
    # page; resolve it against the final page URL so requests accepts it.
    download_url = urljoin(page.url, frame["src"])

    pdf = requests.get(download_url, headers=head, timeout=timeout)
    pdf.raise_for_status()
    with open(target, "wb") as out:
        out.write(pdf.content)
    print(author_date + "pdf downloaded!")
def _record_failure(number, reference, download_url=None):
    """Append one numbered failure entry for *reference* to error.txt."""
    entry = str(number) + "." + reference + " occurs error!\n"
    if download_url is not None:
        entry += " --download_url may be:\n" + download_url + "\n"
    with open("error.txt", "a+", encoding="utf-8") as error:
        error.write(entry)


# Process the reference list: one citation per line, each expected to contain
# "doi:<identifier>" after the "Author (year)." part.
doi_pattern = 'doi:.+'
with f:  # guarantees doi.txt is closed even if the loop is interrupted
    for line in f:
        # Drop only the newline; slicing off the last character would eat a
        # real character when the final line has no trailing newline.
        line = line.rstrip("\n")
        if not line.strip():
            continue  # blank lines are not references
        # File-name stem: everything up to and including the first ")."
        author_date = line.split(').')[0] + ")."
        if re.search(doi_pattern, line):
            # Stray whitespace around the DOI would corrupt the URL.
            doi = line.split('doi:')[1].strip()
            url = "https://www.sci-hub.ren/doi:" + doi + "#"
            try:
                Download(url, author_date)
            except Exception:
                # Keep going with the next reference, but let Ctrl-C and
                # SystemExit propagate instead of being logged as failures.
                err_num = err_num + 1
                _record_failure(err_num, line, url)
                print(line + "\n" + "Failed to download!!!")
        else:
            print(line + "\n"+"doi not found,failed to download!!!")
            err_num = err_num + 1
            _record_failure(err_num, line)
# Python batch download of Sci-Hub papers
# (web-page residue: "latest recommended article published 2024-11-09 14:47:35")