Downloading Images from Pixabay

Preparation

Target site: https://pixabay.com/

  1. Downloading images from this site requires logging in first; you can register an account with a temporary email address.

  2. Use the Cookie-Editor browser extension to export the cookies in JSON format.

  3. Remove the "sameSite": null entries from the cookie data before importing it (see the cleanup sketch after this list).
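
pyppeteer hands cookies to Chrome's DevTools Protocol, where sameSite must be a string if present, so a null value makes page.setCookie fail. A minimal cleanup sketch, assuming the Cookie-Editor export was saved as cookies.json:

import json

# Load the exported cookies and drop any null sameSite fields.
with open('cookies.json', 'r', encoding='utf8') as f:
    cookies = json.load(f)

for cookie in cookies:
    if cookie.get('sameSite') is None:
        cookie.pop('sameSite', None)

with open('cookies.json', 'w', encoding='utf8') as f:
    json.dump(cookies, f, ensure_ascii=False, indent=2)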

Plain API requests to this site come back as a Cloudflare verification page asking you to enable JavaScript and cookies to continue. This is Cloudflare's security mechanism for protecting the site from malicious access.

There is no way to bypass it directly, so we use pyppeteer to drive a real browser instead.
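
pyppeteer is available on PyPI; on first launch it downloads a bundled Chromium unless executablePath points at a locally installed Chrome, as the code below does:

pip install pyppeteer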

Code
import asyncio
import json
import os

from pyppeteer import launch

from spider import Spider  # the author's local multithreaded downloader module


#  Thumbnail URLs on the CDN map to download URLs by picture id, e.g.:
#  https://cdn.pixabay.com/photo/2023/08/19/05/50/wolf-8199785_1280.png
#  https://pixabay.com/images/download/wolf-8199785.png
async def parse_one_page(url):
    options = {
        "executablePath": r"C:\Program Files\Google\Chrome\Application\chrome.exe",
        'userDataDir': './temp',
        "headless": False,
        "defaultViewport": {"width": 1920, "height": 1080},
        "autoClose": True,
        "dumpio": True,
        "args": [
            '--disable-infobars',
            '--no-sandbox',
            '--start-maximized',
        ],
    }
    browser = await launch(**options)
    page = await browser.newPage()
    # Cookies must have their "sameSite": null entries removed before import
    # (see the cleanup sketch in the Preparation section).
    with open('cookies.json', 'r', encoding='utf8') as f:
        cookies_data = json.load(f)

    for cookie in cookies_data:
        await page.setCookie(cookie)
    try:
        await page.goto(url, options={'timeout': 30000})
    except Exception:
        # Ignore navigation timeouts: the thumbnail grid is usually rendered
        # even if the page's load event never fires.
        pass
    # Scroll one viewport at a time until the scroll position stops changing,
    # i.e. the bottom of the page is reached and every lazy-loaded thumbnail
    # has been triggered.
    temp = None
    while True:
        y = await page.evaluate('document.documentElement.scrollTop')
        if y == temp:
            break
        print('scrolling to trigger lazy loading')
        temp = y
        await page.evaluate('window.scrollBy(0, document.documentElement.clientHeight)')
        await asyncio.sleep(2)
    # XPath selecting @src yields attribute nodes whose textContent is the
    # attribute value, i.e. the thumbnail URL.
    src_list = await page.xpath('//a[@class="link--WHWzm"]/img/@src')
    download_url_list = []
    for src in src_list:
        img_url = await (await src.getProperty('textContent')).jsonValue()
        print(img_url)
        # e.g. wolf-8199785_1280.png -> picture id "wolf-8199785"
        pid = img_url.split('/')[-1].split('_')[0]
        download_url = f"https://pixabay.com/images/download/{pid}.jpg"
        download_url_list.append(download_url)
    print(download_url_list)
    # Close the browser so the shared userDataDir is free for the next launch.
    await browser.close()
    return download_url_list


async def parse_download_url(url, save_path):
    options = {
        # "executablePath": r"C:\Program Files\Google\Chrome\Application\chrome.exe",
        'userDataDir': './temp',
        "headless": False,
        "defaultViewport": {"width": 1920, "height": 1080},
        "autoClose": True,
        "dumpio": True,
        "args": [
            '--disable-infobars',
            '--no-sandbox',
            '--start-maximized',
        ],
    }
    browser = await launch(**options)
    page = await browser.newPage()
    with open('cookies.json', 'r', encoding='utf8') as f:
        cookies_data = json.load(f)

    for cookie in cookies_data:
        await page.setCookie(cookie)

    # Navigating to the download URL redirects to the actual CDN file, so
    # response.url is the direct image URL after the redirect.
    response = await page.goto(url)
    print(response.url)
    await browser.close()
    # Build the local save path for the downloader.
    pic_name = url.split('/')[-1]
    os.makedirs(save_path, exist_ok=True)
    save_name = f'{save_path}/{pic_name}'
    return response.url, save_name



if __name__ == '__main__':
    keyword = 'face'
    loop = asyncio.get_event_loop()
    for page_num in range(1, 406):
        print(f'Downloading page {page_num}')
        page_url = f'https://pixabay.com/photos/search/{keyword}/?pagi={page_num}'
        download_url_list = loop.run_until_complete(parse_one_page(page_url))
        # Resolve each download URL to a (direct URL, save path) task, then
        # hand the whole batch to the multithreaded downloader.
        task_list = []
        for download_url in download_url_list:
            one_task = loop.run_until_complete(parse_download_url(download_url, f'./{keyword}'))
            task_list.append(one_task)
        spider = Spider(
            task_list=task_list,
            thread_num=3
        )
        spider.run()
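
The Spider class comes from the author's local spider module, which the post does not show. For reference, here is a minimal sketch of what such a downloader might look like, assuming each task is a (url, save_name) tuple and thread_num worker threads drain a shared queue; only the class name and constructor arguments are taken from the usage above, everything else is an assumption:

# spider.py -- hypothetical sketch, not the author's original module
import queue
import threading

import requests


class Spider:
    def __init__(self, task_list, thread_num=3):
        # Each task is a (url, save_name) tuple as built in __main__ above.
        self.tasks = queue.Queue()
        for task in task_list:
            self.tasks.put(task)
        self.thread_num = thread_num

    def _worker(self):
        while True:
            try:
                url, save_name = self.tasks.get_nowait()
            except queue.Empty:
                return
            try:
                resp = requests.get(url, timeout=60)
                resp.raise_for_status()
                with open(save_name, 'wb') as f:
                    f.write(resp.content)
            except requests.RequestException as e:
                print(f'failed to download {url}: {e}')

    def run(self):
        threads = [threading.Thread(target=self._worker) for _ in range(self.thread_num)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()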
