- 将你需要下载的arxiv文章链接或者arxiv number,分行写入
arxiv_paper_list.txt
形如
https://arxiv.org/pdf/2006.09238
https://arxiv.org/pdf/2009.14178
https://arxiv.org/pdf/2101.01373
- 运行
arxivPaperDownload.py
,代码基于项目(https://github.com/Tachyu/Arxiv-download/tree/c9eb7cdcff80a86c7df4db062572a7439cf5383c)
import time
from selenium import webdriver
import requests
import threading
import os
def Handler(start, end, url, filename):
    """Download the inclusive byte range [start, end] of *url* into *filename*.

    The target file must already exist (download_file pre-creates it) so that
    several threads can each seek to and write their own slice concurrently.

    :param start: first byte offset of this slice (inclusive)
    :param end: last byte offset of this slice (inclusive)
    :param url: URL of the remote file
    :param filename: local path of the pre-created output file
    """
    # HTTP Range request for just this slice of the file.
    headers = {'Range': 'bytes=%d-%d' % (start, end)}
    r = requests.get(url, headers=headers, stream=True, timeout=60)
    # Fail loudly instead of silently writing an HTML error page into the pdf.
    r.raise_for_status()
    # "r+b" keeps the rest of the file intact while this thread writes its slice.
    with open(filename, "r+b") as fp:
        fp.seek(start)
        fp.write(r.content)
def download_file(url_of_file, name, number_of_threads):
    """Download *url_of_file* to *name* by splitting it into byte-range chunks.

    Each chunk is fetched by a Handler thread via an HTTP Range request.
    Prints "Invalid URL" and returns if the size cannot be determined.

    :param url_of_file: URL to download
    :param name: local output path; falls back to the URL's last path segment
    :param number_of_threads: number of concurrent range-request threads
    """
    try:
        head = requests.head(url_of_file, timeout=30)
        file_size = int(head.headers['content-length'])
    except (requests.RequestException, KeyError, ValueError):
        print("Invalid URL")
        return

    file_name = name if name else url_of_file.split('/')[-1]

    # Pre-create (truncate) the file so Handler threads can open it "r+b".
    open(file_name, "wb").close()

    part = file_size / number_of_threads
    threads = []
    for i in range(number_of_threads):
        start = int(part * i)
        # Ranges are inclusive, so end one byte before the next chunk's start
        # to avoid overlap; the final chunk runs to the true end of the file
        # so truncation in int(...) cannot drop the tail bytes.
        if i == number_of_threads - 1:
            end = file_size - 1
        else:
            end = int(part * (i + 1)) - 1
        t = threading.Thread(target=Handler,
                             kwargs={'start': start, 'end': end,
                                     'url': url_of_file, 'filename': file_name},
                             daemon=True)  # daemon=... replaces deprecated setDaemon()
        t.start()
        threads.append(t)

    # Join only the threads we started, not every thread in the process.
    for t in threads:
        t.join()
info_txt_path = '/data/amax/users/liuwenzhe/paper/code/arxiv_paper_list.txt'  # todo:1
save_path = '/data/amax/users/liuwenzhe/paper/lily_202102/Deep Learning for Generic Object Detection A Survey'  # todo: 2

# Read one arxiv pdf link (e.g. https://arxiv.org/pdf/2006.09238) per line
# and download each as <id>.pdf into save_path.
with open(info_txt_path) as file:
    for line in file:
        line = line.strip()
        if not line:
            continue  # skip blank lines
        pdf_url = line + '.pdf'
        # Take the URL's last path segment instead of a fixed-width slice
        # (pdf_url[-14:]), which breaks for old-style arxiv identifiers.
        filename = pdf_url.split('/')[-1]
        print('filename:{}, pdf_url:{}.'.format(filename, pdf_url))
        print('\nDownloading {} ...'.format(filename))
        ts = time.time()
        download_file(url_of_file=pdf_url,
                      name=os.path.join(save_path, filename),
                      number_of_threads=1)
        te = time.time()
        print('{:.0f}s [Complete] {}'.format(te - ts, filename))
- 运行
downloadedArxivRename.py
,将下载好的arxiv pdf文件重命名为文章的题目。
import requests
from bs4 import BeautifulSoup
import os
import sys
def main(folder_path=None):
    """Rename each downloaded arxiv pdf in *folder_path* to its paper title.

    For every file "<arxiv-id>.pdf", the title is scraped from
    https://arxiv.org/abs/<arxiv-id>; spaces and hyphens become underscores.
    The renamed file is moved into folder_path[:-6] — i.e. the parent of a
    trailing ".../arxiv" directory, matching this project's layout convention.

    :param folder_path: absolute path of the directory holding the pdfs
    """
    # BUG FIX: isdir must be checked against the full path; the original
    # tested bare names relative to the CWD, so it never filtered anything.
    files = [f for f in os.listdir(folder_path)
             if not os.path.isdir(os.path.join(folder_path, f))]
    for filename in files:
        # filename[:-4] strips ".pdf" to recover the arxiv id.
        page = requests.get("https://arxiv.org/abs/" + filename[:-4], timeout=30)
        if page.status_code == 200:  # check the abstract page exists
            soup = BeautifulSoup(page.content, 'html.parser')
            tags = soup.find("h1", "title mathjax")
            # The h1 contains ["Title:", "<actual title>"]; take the title.
            x = list(tags)
            file_name = str(x[1].strip())
            file_name = file_name.replace(" ", "_").replace("-", "_") + '.pdf'
            # No os.chdir needed: both rename paths are absolute, and changing
            # the process CWD was an unwanted side effect.
            print('Renaming {} to {} ...'.format(filename, file_name))
            os.rename(os.path.join(folder_path, filename),
                      os.path.join(folder_path[:-6], file_name))
        else:
            print("Not an arxiv paper!")
if __name__=="__main__":
    # Folder containing the downloaded arxiv pdfs; after renaming, main()
    # moves each file into this folder's parent (it strips the trailing
    # "/arxiv" via folder_path[:-6]).
    folder_path = '/data/amax/users/liuwenzhe/paper/lily_202102/Texture Classification in Extreme Scale Variations using GANet/arxiv'
    main(folder_path=folder_path)
项目链接:https://github.com/rrryan2016/CV_conference_paper_download/tree/main/arxiv