pyppeteer爬虫保存图片，python爬虫，完美

大得369

已于 2024-08-27 22:08:29 修改

阅读量715

点赞数 9

文章标签：爬虫 python 开发语言

于 2024-08-27 22:00:10 首次发布

本文链接：https://blog.csdn.net/qq_34631220/article/details/141613573

版权

# Requires: pip install pyppeteer requests
# Uses Pyppeteer (the asyncio-based approach) to render the page; requests downloads the images.
import asyncio
import os
import random

import requests
from pyppeteer import launch
# Default page scraped by main(): Sogou image search for the query "图片".
SOGOU_IMAGE_SEARCH_URL = (
    'https://pic.sogou.com/pics?st=255&channel=vr&scene=pic_result'
    '&query=%E5%9B%BE%E7%89%87&rawQuery=%E5%9B%BE%E7%89%87'
    '&vrExpId=&vrAdParams=&hitKey='
)


def _guess_extension(url):
    """Return the lower-cased file extension of an http(s) URL path, or None."""
    from urllib.parse import urlparse

    if not url:
        return None
    parsed = urlparse(url)
    if parsed.scheme not in ('http', 'https'):
        return None
    # Only the path is inspected so query strings and host names
    # (e.g. "example.com") are never mistaken for an extension.
    extension = os.path.splitext(parsed.path)[1]
    return extension[1:].lower() or None


def _fetch(url):
    """Blocking HTTP GET with a 5 second timeout; meant to run in an executor."""
    return requests.get(url, timeout=5)


def _unused_png_path(save_dir):
    """Return a '<random number>.png' path inside save_dir that does not exist yet."""
    while True:
        random_number = random.randint(1000000, 9000000)
        candidate = os.path.join(save_dir, f'{random_number}.png')
        if not os.path.exists(candidate):
            return candidate


async def main(url=SOGOU_IMAGE_SEARCH_URL, save_dir='dade'):
    """Open a page in headless Chrome and download every <img> it contains.

    For each <img> element the ``src`` and ``alt`` properties are read and
    recorded; every http(s) ``src`` that answers with status 200 is written
    to ``save_dir`` under a random ``<number>.png`` name. Failures on a single
    image are printed and do not stop the run.

    Args:
        url: Page to open. Defaults to the Sogou image-search page.
        save_dir: Directory the images are written to; created if missing.

    Returns:
        A list of dicts with the keys ``'src'``, ``'名称'`` (alt text) and
        ``'格式'`` (extension guessed from the URL path, or None), one per
        <img> element whose properties could be read.
    """
    # open() below fails if the target directory does not exist.
    os.makedirs(save_dir, exist_ok=True)
    # Inside a coroutine this returns the loop that is currently running.
    loop = asyncio.get_event_loop()
    img_info = []

    browser = await launch()
    try:
        page = await browser.newPage()
        await page.goto(url)
        # Collect the element handles of all <img> tags on the page.
        for img_element in await page.querySelectorAll("img"):
            try:
                # Image address.
                src = await img_element.getProperty('src')
                src_value = await src.jsonValue()
                # Image name (alt text).
                alt = await img_element.getProperty('alt')
                alt_value = await alt.jsonValue() if alt else None
                # Image format, computed fresh per image so a missing src can
                # neither raise nor inherit the previous image's extension.
                file_extension = _guess_extension(src_value)
                img_info.append({'src': src_value, '名称': alt_value, '格式': file_extension})

                # Empty, data: and other non-http sources cannot be fetched
                # with requests, so report them instead of raising.
                if not (src_value and src_value.startswith(('http://', 'https://'))):
                    print(f"不可以访问:{src_value}")
                    continue

                # requests is blocking; run it in the default executor so the
                # event loop (and the browser connection on it) keeps running.
                response = await loop.run_in_executor(None, _fetch, src_value)
                if response.status_code == 200:
                    img_name = os.path.basename(src_value)
                    print(f"可以访问:{src_value}，保存图片：{img_name}")
                    # The URL basename is often not a usable file name, so the
                    # image is stored under a random number with a .png suffix.
                    with open(_unused_png_path(save_dir), 'wb') as f:
                        f.write(response.content)
                else:
                    print(f"不可以访问:{src_value}")
            except Exception as e:
                # Best effort: one broken image must not abort the whole run.
                print(f"出错啦: {e}")
        print(img_info)
    finally:
        # Always shut the browser down, even when navigation fails.
        await browser.close()
    return img_info
if __name__ == "__main__":
    # Run the scraper only when executed as a script, never on import.
    # An explicit loop is created because calling asyncio.get_event_loop()
    # without a current loop is deprecated in recent Python versions.
    # The loop is deliberately left open (no asyncio.run / loop.close()):
    # NOTE(review): pyppeteer appears to reuse the loop in an exit hook to
    # kill Chrome if the browser was not closed - confirm before closing it.
    _loop = asyncio.new_event_loop()
    asyncio.set_event_loop(_loop)
    _loop.run_until_complete(main())