CVPR2023论文批量下载脚本

CVPR2023论文批量下载脚本

代码

# -*- coding: utf-8 -*-
"""
# @file name : CVPR2023-PDF-downloader.py
# @author    : 蔡不菜和他的uU们 https://www.caibucai.top/
# @date      : 2023-11-22 15:31
# @brief     : CVPR 2023 paper 下载
"""
import threading
import os
import urllib3
import requests
import xml
from bs4 import BeautifulSoup



# Entry page and download destination for CVPR 2023 open-access papers.
start_url = 'https://openaccess.thecvf.com/CVPR2023'
base_url = "https://openaccess.thecvf.com"
dst_dir = './CVPR2023/'

# Fetch the landing page and locate its search <form>, whose `action`
# attribute gives the relative endpoint the search POST must target.
landing_page = requests.get(start_url)
landing_soup = BeautifulSoup(landing_page.text, 'html.parser')
search_form = landing_soup.find('form')
search_url = base_url + search_form.get('action')
print('search_url: ', search_url)


# Ask the user for a search term; it also names the sub-directory
# the matching PDFs are saved into.
keyword = input('please input search key:\n')

# POST payload and headers for the open-access search endpoint.
data = {"query": keyword}
headers = {"User-Agent": "LogStatistic"}

# Shared download bookkeeping, updated by the worker threads below.
pdf_urls = []
successful_downloads = 0
failed_urls = []

# Run the search and harvest every anchor whose text is exactly 'pdf' —
# on openaccess.thecvf.com those are the direct links to the paper files.
search_response = requests.post(search_url, data=data, headers=headers)
if search_response.status_code != 200:
    print("Failed to send, please check")
else:
    print('search success')
    search_soup = BeautifulSoup(search_response.text, "html.parser")
    for anchor in search_soup.find_all("a"):
        if anchor.text == 'pdf':
            print(anchor)
            pdf_urls.append(base_url + anchor.get('href'))
    print("find ", len(pdf_urls), " papers!")

print('start downloading ')

# Save files under ./CVPR2023/<keyword>/. Using exist_ok=True avoids
# the check-then-create race of the original exists()/makedirs() pair.
dst_path = os.path.join(dst_dir, keyword)
os.makedirs(dst_path, exist_ok=True)

def download_pdf(url, dst_path):
    """Download one PDF from `url` into the directory `dst_path`.

    Side effects on module-level bookkeeping: increments
    `successful_downloads` on success; appends `url` to `failed_urls`
    on any failure (HTTP error status, network error, or unexpected
    exception).

    NOTE(review): the counter/list updates are not lock-protected;
    under CPython's GIL the lost-update window is tiny, but strictly
    they should be guarded when called from multiple threads.
    """
    global successful_downloads
    global failed_urls
    try:
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            # The file name is the last path component of the URL.
            file_name = url.split("/")[-1]
            with open(os.path.join(dst_path, file_name), "wb") as file:
                file.write(response.content)
            print(f"Downloaded: {file_name}")
            successful_downloads += 1
        else:
            # BUG FIX: a non-200 response used to print a message but was
            # never recorded in failed_urls, so the final summary and the
            # retry list undercounted failures.
            print(f"Failed to download: {url}")
            failed_urls.append(url)
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {url}: {e}")
        failed_urls.append(url)
    except Exception as e:
        # BUG FIX: unexpected errors (e.g. a file-write failure) are now
        # also recorded as failures instead of vanishing from the summary.
        print(f"An unexpected error occurred: {e}")
        failed_urls.append(url)


# Fan the downloads out over a small thread pool; the work is I/O-bound,
# so threads overlap the network waits. The `with` block joins all
# workers before the summary is printed.
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as executor:
    for pdf_url in pdf_urls:
        executor.submit(download_pdf, pdf_url, dst_path)

# Final report: overall totals plus the list of URLs worth retrying.
print(f"total {len(pdf_urls)}, success {successful_downloads}, failed {len(failed_urls)}")

if failed_urls:
    print("failed pdf:")
    for url in failed_urls:
        print(url)

说明

有问题,后期维护更新,欢迎留言、进群讨论或私聊:【群号:392784757】

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值