Python 异步爬虫爬取图片

最新推荐文章于 2024-04-27 10:25:54 发布

花开不识君

最新推荐文章于 2024-04-27 10:25:54 发布

阅读量1.7k

点赞数

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/weixin_43779268/article/details/118655665

版权

python 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

周末没什么事，想温习一下以前学过的 python 爬虫，抓了一些图片，发现有点慢，于是改成了异步爬虫，用了几个异步的库。

在这里插入图片描述

同步方法

import requests
from lxml import etree
from time import time
from selenium import webdriver
import os
from time import sleep

# Request headers sent with every HTTP call in this script; only a
# desktop-browser User-Agent is set.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
    ),
}


def preprocessing(browser):
    """Download every album linked from https://m.tuiimg.com/, one image at a time.

    The home page is opened in ``browser`` to collect the album links. Each
    album page is then fetched with ``requests`` and parsed for the album
    title, the number of images and the directory URL of the images. The
    images are requested as ``1.jpg`` .. ``<total>.jpg`` under that directory
    and stored sequentially through ``saveImage``.

    Args:
        browser: Selenium WebDriver instance used to load the home page.

    Raises:
        IndexError: If an album page lacks one of the expected nodes.
    """
    # Local import: keeps this function self-contained with respect to the
    # module's import block.
    from selenium.webdriver.common.by import By

    url = 'https://m.tuiimg.com/'
    browser.get(url=url)
    # find_elements(By.XPATH, ...) is available on Selenium 3 and 4; the
    # find_elements_by_xpath shortcut no longer exists in Selenium >= 4.3.
    a_arr = browser.find_elements(By.XPATH, '//*[@id="main"]/li/a')
    for image in a_arr:
        # Album page URL, e.g. https://m.tuiimg.com/meinv/2195/
        item_href = image.get_attribute("href")
        res = requests.get(url=item_href, headers=headers).content
        tree = etree.HTML(res)
        total_text = tree.xpath('//*[@id="allbtn"]/text()')[0]
        # Total number of images in the album: the part after the first "/",
        # with the closing ")" stripped (text presumably looks like "(1/30)").
        image_total = int(total_text.split('/')[1].replace(")", ""))
        # Album title, used as the file-name prefix.
        images_name = tree.xpath('//*[@id="container"]/div[3]/h1/text()')[0]
        print(images_name)
        # Directory URL of the images: the current image URL without its file name.
        base_img_url = "/".join(tree.xpath('//*[@id="nowimg"]/@src')[0].split("/")[0:-1]) + "/"
        print(base_img_url)
        start_time = time()
        for i in range(1, image_total + 1):
            image_url = base_img_url + str(i) + ".jpg"
            saveImage(image_url, images_name + str(i) + ".jpg")
        print(images_name, "图集下载花费的时间是" + str(time() - start_time), "秒")


def saveImage(image_url, images_name):
    """Download one image and write it into the local album directory.

    Args:
        image_url: URL of the image to download.
        images_name: File name (not a path) under which the image is stored
            inside ``./妹子美图合集/``.
    """
    file_path = "./妹子美图合集/"
    # exist_ok replaces the check-then-create pair with a single call that
    # tolerates an already existing directory.
    os.makedirs(file_path, exist_ok=True)
    # Without a timeout a stalled connection would block the script forever.
    imag = requests.get(url=image_url, headers=headers, timeout=30).content
    with open(file_path + images_name, "wb") as fp:
        fp.write(imag)
    # Report success only once the file has been flushed and closed.
    print(images_name, "保存成功")


if __name__ == '__main__':
    # Start Chrome through the chromedriver.exe binary.
    # NOTE(review): the executable_path argument is not accepted by recent
    # Selenium 4 releases -- confirm the installed Selenium version.
    chrome = webdriver.Chrome(executable_path='chromedriver.exe')
    try:
        preprocessing(browser=chrome)
    finally:
        # Always close the browser and its driver process, even when the
        # scrape fails part-way through.
        chrome.quit()

下载一组图大概要 60s，于是就有了下面的代码。

异步方法

import requests
import aiohttp
import asyncio
import aiofiles
from time import time
from lxml import etree
from selenium import webdriver
import os
from time import sleep

# Request headers sent with every HTTP call in this script; only a
# desktop-browser User-Agent is set.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
    ),
}

def preprocessing(browser):
    """Download every album linked from https://m.tuiimg.com/, images of one album concurrently.

    The home page is opened in ``browser`` to collect the album links. Each
    album page is then fetched with ``requests`` and parsed for the album
    title, the number of images and the directory URL of the images. One
    ``saveImage`` coroutine is created per image (``1.jpg`` .. ``<total>.jpg``)
    and all coroutines of an album are run together on an event loop.

    A failed image download does not abort the album; the failure is printed
    and the remaining downloads continue.

    Args:
        browser: Selenium WebDriver instance used to load the home page.

    Raises:
        IndexError: If an album page lacks one of the expected nodes.
    """
    # Local import: keeps this function self-contained with respect to the
    # module's import block.
    from selenium.webdriver.common.by import By

    async def run_album(jobs):
        # asyncio.wait() rejects bare coroutine objects on Python >= 3.11 and
        # raises ValueError for an empty collection; gather() accepts both.
        # return_exceptions=True keeps one failed image from cancelling the rest.
        return await asyncio.gather(*jobs, return_exceptions=True)

    url = 'https://m.tuiimg.com/'
    browser.get(url=url)
    # find_elements(By.XPATH, ...) is available on Selenium 3 and 4; the
    # find_elements_by_xpath shortcut no longer exists in Selenium >= 4.3.
    a_arr = browser.find_elements(By.XPATH, '//*[@id="main"]/li/a')
    for image in a_arr:
        # Album page URL, e.g. https://m.tuiimg.com/meinv/2195/
        item_href = image.get_attribute("href")
        res = requests.get(url=item_href, headers=headers).content
        tree = etree.HTML(res)
        total_text = tree.xpath('//*[@id="allbtn"]/text()')[0]
        # Total number of images in the album: the part after the first "/",
        # with the closing ")" stripped (text presumably looks like "(1/30)").
        image_total = int(total_text.split('/')[1].replace(")", ""))
        # Album title, used as the file-name prefix.
        images_name = tree.xpath('//*[@id="container"]/div[3]/h1/text()')[0]
        print(images_name)
        # Directory URL of the images: the current image URL without its file name.
        base_img_url = "/".join(tree.xpath('//*[@id="nowimg"]/@src')[0].split("/")[0:-1]) + "/"
        print(base_img_url)
        start_time = time()
        # The downloads are the slow part, so they are scheduled concurrently.
        tasks = [
            saveImage(base_img_url + str(i) + ".jpg", images_name + str(i) + ".jpg")
            for i in range(1, image_total + 1)
        ]
        # asyncio.run() creates and closes a fresh event loop per album instead
        # of relying on the deprecated implicit loop from get_event_loop().
        results = asyncio.run(run_album(tasks))
        for outcome in results:
            if isinstance(outcome, BaseException):
                print(images_name, "下载失败", repr(outcome))
        print(images_name, "图集下载花费的时间是" + str(time() - start_time), "秒")


async def saveImage(image_url, images_name):
    """Download one image asynchronously and write it into the local album directory.

    Args:
        image_url: URL of the image to download.
        images_name: File name (not a path) under which the image is stored
            inside ``./妹子美图合集1/``.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error
            status (400 or above).
    """
    print("准备下载", images_name)
    file_path = "./妹子美图合集1/"
    # exist_ok replaces the check-then-create pair with a single call that
    # tolerates an already existing directory.
    os.makedirs(file_path, exist_ok=True)
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=image_url) as response:
            # Fail loudly instead of saving an HTTP error page under a .jpg
            # name and reporting it as a successful download.
            response.raise_for_status()
            async with aiofiles.open(file_path + images_name, "wb") as afp:
                await afp.write(await response.content.read())
        print(images_name, "保存成功")

if __name__ == '__main__':
    # Start Chrome through the chromedriver.exe binary.
    # NOTE(review): the executable_path argument is not accepted by recent
    # Selenium 4 releases -- confirm the installed Selenium version.
    chrome = webdriver.Chrome(executable_path='chromedriver.exe')
    try:
        preprocessing(browser=chrome)
    finally:
        # Always close the browser and its driver process, even when the
        # scrape fails part-way through.
        chrome.quit()

在这里插入图片描述

效率大概提升了5倍，还凑合…

把上面那段用 split/replace 处理字符串的代码换成正则，还能再快点：


import re

# Total number of images: the digits right after the "/" in the "allbtn"
# text (assumed to look like "(1/30)"). Anchoring on "/" also handles a
# total of 1 and cannot pick up the current-image index by mistake.
image_total = int(re.search(r'/\s*(\d+)', tree.xpath('//*[@id="allbtn"]/text()')[0]).group(1))

# Directory URL of the images: strip the trailing file name (whatever its
# length) from the current image URL and keep the final "/".
base_img_url = re.sub(r'[^/]+$', '', tree.xpath('//*[@id="nowimg"]/@src')[0])

花开不识君

关注

0
点赞
踩
11

收藏

觉得还不错? 一键收藏
0
评论
Python 异步爬虫爬取图片

周末没什么事,想温习一下以前学过的python爬虫,抓了一些图片,发现优点慢,于是改成了异步爬虫,用了几个异步的库同步方法import requestsfrom lxml import etreefrom time import timefrom selenium import webdriverimport osfrom time import sleepheaders = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win.
复制链接

扫一扫