import re
import requests
import urllib.request
import os
import socket
# Download every paper PDF listed on the CVPR 2023 open-access index page.
#
# The script scrapes the index page for PDF links and paper titles, pairs them
# in page order, and saves each PDF under LOCAL_DIR using the sanitized title
# as the file name. Files that already exist are skipped, so the script can be
# re-run to resume an interrupted session.

# Index page that lists all CVPR 2023 papers.
INDEX_URL = 'https://openaccess.thecvf.com/CVPR2023?day=all'
# Prefix joined with the site-relative hrefs scraped from the index page.
BASE_URL = 'http://openaccess.thecvf.com/'
# Your local path to download pdf files into.
LOCAL_DIR = './CVPR2023/'
# Seconds to wait on a single network operation before giving up.
TIMEOUT_SECONDS = 30

# Characters replaced with '_' to avoid illegal punctuation in file names.
# NOTE(review): the original code replaced the empty string (''), which would
# insert '_' between every character; that looks like a space lost in a paste,
# so a space is used here -- confirm.
_ILLEGAL_CHARS = str.maketrans({char: '_' for char in ':"?/ '})


def sanitize_file_name(title):
    """Return *title* with ':', '"', '?', '/' and ' ' each replaced by '_'."""
    return title.translate(_ILLEGAL_CHARS)


def extract_papers(html):
    """Return ``(pdf_href, title)`` pairs scraped from the index page *html*.

    Links and titles are matched independently and paired by position. If the
    two counts differ, a warning is printed and only the leading pairs are
    returned, instead of failing with an IndexError part-way through the run.
    """
    # find all pdf links
    link_list = re.findall(r"(?<=href=\").+?pdf(?=\">pdf)", html)
    name_list = re.findall(r"(?<=paper.html\">).+(?=</a>)", html)
    if len(link_list) != len(name_list):
        print(
            f"warning: found {len(link_list)} pdf links but "
            f"{len(name_list)} titles; only the first "
            f"{min(len(link_list), len(name_list))} are paired."
        )
    return list(zip(link_list, name_list))


def build_pdf_url(href):
    """Return the absolute download URL for a site-relative *href*."""
    # Drop leading slashes so the result has no '//' after the host name.
    return BASE_URL + href.lstrip('/')


def download_pdf(url, file_path):
    """Download *url* to *file_path*, never leaving a truncated file behind.

    The data is first written to ``file_path + '.part'`` and only moved into
    place once the transfer has completed; otherwise a failed download would
    look like a finished one and be skipped on the next run.

    Raises whatever ``urllib.request.urlretrieve`` or ``os.replace`` raises.
    """
    partial_path = file_path + '.part'
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
    finally:
        # After a successful replace the partial file is already gone.
        if os.path.exists(partial_path):
            os.remove(partial_path)


def main():
    """Fetch the index page and download every listed PDF that is missing."""
    # get web context
    response = requests.get(INDEX_URL, timeout=TIMEOUT_SECONDS)
    # Fail loudly on an HTTP error instead of scraping an error page.
    response.raise_for_status()
    papers = extract_papers(response.text)

    # urlretrieve has no timeout argument; the socket default applies to it.
    socket.setdefaulttimeout(TIMEOUT_SECONDS)
    os.makedirs(LOCAL_DIR, exist_ok=True)

    num = len(papers)
    for cnt, (href, title) in enumerate(papers):
        file_name = sanitize_file_name(title)
        file_path = os.path.join(LOCAL_DIR, file_name + '.pdf')
        if os.path.exists(file_path):
            print('[{}.pdf]exists, skip downloading.'.format(file_name))
            continue
        # download pdf files
        print(f"[{cnt}/{num}]] Downloading->" + file_path)
        try:
            download_pdf(build_pdf_url(href), file_path)
        except Exception as exc:
            # Best effort: report the failure and move on to the next paper.
            # Unlike a bare except, this lets Ctrl-C stop the script.
            print(f"download failed, skipping: {file_path} ({exc})")
    print("all download finished")


if __name__ == "__main__":
    main()
# Source: "2359篇CVPR2023论文开放下载链接" (open download links for 2,359 CVPR 2023 papers) - Zhihu (zhihu.com)
# Colab: https://colab.research.google.com/drive/1m10ifBq6YKKchsa4oXhuA5B-lEIvH4Jj?usp=sharing