基于Python的Arxiv文章批量下载

  1. 将你需要下载的arxiv文章链接或者arxiv number,分行写入arxiv_paper_list.txt
    形如
    https://arxiv.org/pdf/2006.09238
    https://arxiv.org/pdf/2009.14178
    https://arxiv.org/pdf/2101.01373
  2. 运行arxivPaperDownload.py,代码基于项目(https://github.com/Tachyu/Arxiv-download/tree/c9eb7cdcff80a86c7df4db062572a7439cf5383c)
import time 
from selenium import webdriver
import requests
import threading
import os 

def Handler(start, end, url, filename):
    """Download one byte range of ``url`` and write it into ``filename``.

    Args:
        start: Offset of the first byte to fetch (inclusive).
        end: Offset of the last byte to fetch (inclusive, as in an HTTP
            ``Range`` header).
        url: Address of the remote file.
        filename: Path of an already existing local file; the fetched bytes
            are written into it at offset ``start``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    headers = {'Range': 'bytes=%d-%d' % (start, end)}
    # The timeout keeps a stalled connection from blocking this worker
    # forever; the context manager releases the connection when done.
    with requests.get(url, headers=headers, stream=True, timeout=60) as r:
        # Fail loudly instead of writing an error page into the output file.
        r.raise_for_status()
        data = r.content
        if r.status_code != 206:
            # The server ignored the Range header and sent the whole file;
            # keep only the slice this call is responsible for.
            data = data[start:end + 1]
    # "r+b" updates the file in place without truncating what is already
    # there.
    with open(filename, "r+b") as fp:
        fp.seek(start)
        fp.write(data)

def download_file(url_of_file, name, number_of_threads):
    """Download ``url_of_file`` to disk using ranged requests in threads.

    Args:
        url_of_file: Address of the file to download.
        name: Destination path; when empty, the last path segment of the URL
            is used instead.
        number_of_threads: How many byte ranges to fetch concurrently.
            Values below 1 are treated as 1.

    Returns:
        None. Prints "Invalid URL" and returns early when the server does
        not answer successfully or reports no usable Content-Length.
    """
    # requests.head() does not follow redirects unless asked to; follow them
    # so Content-Length describes the real file, not a redirect response.
    r = requests.head(url_of_file, allow_redirects=True, timeout=60)
    file_name = name if name else url_of_file.split('/')[-1]

    if not r.ok:
        print("Invalid URL")
        return
    try:
        file_size = int(r.headers['content-length'])
    except (KeyError, ValueError):
        print("Invalid URL")
        return

    thread_count = max(1, number_of_threads)

    # Create (or truncate) the target so the workers can open it with "r+b".
    with open(file_name, "wb"):
        pass

    workers = []
    for i in range(thread_count):
        # Integer arithmetic yields contiguous, non-overlapping ranges; the
        # end offset is inclusive, matching HTTP Range semantics.
        start = file_size * i // thread_count
        end = file_size * (i + 1) // thread_count - 1
        if end < start:
            # More threads than bytes: nothing left for this worker.
            continue
        t = threading.Thread(
            target=Handler,
            kwargs={'start': start, 'end': end,
                    'url': url_of_file, 'filename': file_name},
            daemon=True,
        )
        t.start()
        workers.append(t)

    # Wait only for the threads started here; joining every thread in the
    # process could block on unrelated long-running threads.
    for t in workers:
        t.join()

info_txt_path = '/data/amax/users/liuwenzhe/paper/code/arxiv_paper_list.txt' # todo:1
save_path = '/data/amax/users/liuwenzhe/paper/lily_202102/Deep Learning for Generic Object Detection A Survey' # todo: 2

# Read the list one entry per line; the context manager closes the file even
# if a download raises.
with open(info_txt_path) as list_file:
    # Make sure the destination exists before a download tries to create a
    # file inside it.
    os.makedirs(save_path, exist_ok=True)

    for line in list_file:
        # strip() also removes "\r" and stray spaces, not just "\n".
        entry = line.strip()
        if not entry:
            # A blank line would otherwise turn into the URL ".pdf".
            continue

        # Accept a bare arXiv number as well as a full link.
        if not entry.startswith(('http://', 'https://')):
            entry = 'https://arxiv.org/pdf/' + entry
        pdf_url = entry if entry.endswith('.pdf') else entry + '.pdf'

        # Use the last URL segment as the file name. A fixed-width slice
        # breaks for ids of other lengths and can produce a name starting
        # with "/", which makes os.path.join() discard save_path.
        filename = pdf_url.rsplit('/', 1)[-1]
        print('filename:{}, pdf_url:{}.'.format(filename,pdf_url))

        print('\nDownloading {} ...'.format(filename))
        ts = time.time()
        download_file(url_of_file=pdf_url, name=os.path.join(save_path,filename),number_of_threads=1)
        te = time.time()
        print('{:.0f}s [Complete] {}'.format(te-ts, filename))
  3. 运行downloadedArxivRename.py,将下载好的arxiv pdf文件重命名为文章的题目。
import requests
from bs4 import BeautifulSoup
import os
import sys

def main(folder_path=None, dest_dir=None):
    """Rename downloaded arXiv PDFs after the titles of their papers.

    Every regular ``*.pdf`` file in ``folder_path`` is assumed to be named
    ``<arxiv id>.pdf``. The id is looked up on https://arxiv.org/abs/ and
    the file is moved into ``dest_dir`` under a name built from the title.

    Args:
        folder_path: Directory holding the downloaded files. ``None`` means
            the current working directory.
        dest_dir: Directory that receives the renamed files. Defaults to the
            parent directory of ``folder_path``; that is where the earlier
            ``folder_path[:-6]`` slice put them for a path ending in
            ``/arxiv``.
    """
    if folder_path is None:
        folder_path = os.getcwd()
    if dest_dir is None:
        # abspath() normalises a trailing slash so dirname() really yields
        # the parent directory.
        dest_dir = os.path.dirname(os.path.abspath(folder_path))

    # Map spaces and dashes (as before) plus characters that are path
    # separators or invalid in file names on common systems to "_".
    unsafe = str.maketrans({c: '_' for c in ' -/\\:*?"<>|\n\t'})

    for filename in os.listdir(folder_path):
        source = os.path.join(folder_path, filename)
        # Test the full path: a bare name would be resolved against the
        # current working directory and subdirectories would slip through.
        if os.path.isdir(source):
            continue
        if not filename.lower().endswith('.pdf'):
            print ("Not an arxiv paper!")
            continue

        arxiv_id = filename[:-4]  # drop ".pdf"
        page = requests.get("https://arxiv.org/abs/" + arxiv_id, timeout=60)
        if page.status_code != 200:  # no abstract page for this id
            print ("Not an arxiv paper!")
            continue

        soup = BeautifulSoup(page.content, 'html.parser')
        heading = soup.find("h1", "title mathjax")
        # The title is taken from the second child of the heading, as
        # before; presumably the first child is a label (confirm against
        # the page markup).
        children = list(heading) if heading is not None else []
        title = str(children[1]).strip() if len(children) > 1 else ''
        if not title:
            print('No title found for {}, skipping.'.format(filename))
            continue

        file_name = title.translate(unsafe) + '.pdf'
        print('Renaming {} to {} ...'.format(filename, file_name))
        # No os.chdir() here: changing the working directory mid-loop made
        # a relative folder_path point to the wrong place afterwards.
        os.rename(source, os.path.join(dest_dir, file_name))

if __name__=="__main__":
    # Folder to process: the first command-line argument when given,
    # otherwise the hard-coded default below.
    default_folder = '/data/amax/users/liuwenzhe/paper/lily_202102/Texture Classification in Extreme Scale Variations using GANet/arxiv'
    folder_path = sys.argv[1] if len(sys.argv) > 1 else default_folder
    main(folder_path=folder_path)

项目链接:https://github.com/rrryan2016/CV_conference_paper_download/tree/main/arxiv

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值