Python 异步爬虫爬取图片

周末没什么事,想温习一下以前学过的 Python 爬虫,抓了一些图片,发现有点慢,于是改成了异步爬虫,用了几个异步的库(asyncio、aiohttp、aiofiles)。

在这里插入图片描述

同步方法

import requests
from lxml import etree
from time import time
from selenium import webdriver
import os
from time import sleep

# Request headers sent with every HTTP request made by this script.
# The user-agent literal is split across lines only to keep the source narrow.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
    ),
}


def preprocessing(browser):
    """Download every gallery linked from the site's front page, image by image.

    Loads the front page in ``browser``, collects the gallery links with an
    XPath query, then for each gallery fetches its page with ``requests``,
    reads the image count, title and image base URL from the HTML, and saves
    images ``1.jpg`` .. ``<count>.jpg`` sequentially through ``saveImage``.
    The elapsed time per gallery is printed.

    Args:
        browser: A Selenium WebDriver instance used to render the front page.

    Raises:
        IndexError: If a gallery page lacks one of the expected elements.
        requests.exceptions.RequestException: If a gallery page cannot be
            fetched (including a timeout).
    """
    url = 'https://m.tuiimg.com/'
    browser.get(url=url)
    # find_elements(by, value) exists in both Selenium 3 and 4, whereas the
    # find_elements_by_xpath shortcut was removed in Selenium 4.
    a_arr = browser.find_elements("xpath", '//*[@id="main"]/li/a')
    for image in a_arr:
        # Gallery page address, e.g. https://m.tuiimg.com/meinv/2195/
        item_href = image.get_attribute("href")
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url=item_href, headers=headers, timeout=10).content
        tree = etree.HTML(res)
        total_text = tree.xpath('//*[@id="allbtn"]/text()')[0]
        # Total number of images: the part after "/" with the ")" stripped.
        image_total = int(total_text.split('/')[1].replace(")", ""))
        # Gallery title
        images_name = tree.xpath('//*[@id="container"]/div[3]/h1/text()')[0]
        print(images_name)
        # Drop the file name of the currently shown image, keep the directory.
        base_img_url = "/".join(tree.xpath('//*[@id="nowimg"]/@src')[0].split("/")[0:-1]) + "/"
        print(base_img_url)
        start_time = time()
        for i in range(1, image_total + 1):
            image_url = base_img_url + str(i) + ".jpg"
            saveImage(image_url, images_name + str(i) + ".jpg")
        print(images_name, "图集下载花费的时间是" + str(time() - start_time), "秒")


def saveImage(image_url,images_name):
    """Download one image and store it in the local gallery directory.

    Args:
        image_url: URL of the image to download.
        images_name: File name to write inside ``./妹子美图合集/``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    file_path = "./妹子美图合集/"
    # exist_ok replaces the separate exists() check and cannot fail when the
    # directory is already there.
    os.makedirs(file_path, exist_ok=True)
    response = requests.get(url=image_url, headers=headers, timeout=10)
    if response.status_code != 200:
        # Do not store an error page under a .jpg name and report success.
        print(images_name, "下载失败,状态码", response.status_code)
        return
    with open(file_path + images_name, "wb") as fp:
        fp.write(response.content)
    print(images_name,"保存成功")


if __name__ == '__main__':
    # Script entry point: start Chrome through the local chromedriver binary
    # and crawl every gallery on the front page.
    chrome = webdriver.Chrome(executable_path='chromedriver.exe')
    try:
        preprocessing(browser=chrome)
    finally:
        # Always close the browser and stop the chromedriver process,
        # even when the crawl raises.
        chrome.quit()

同步方式下载一组图大概需要 60s,于是就有了下面的代码。

异步方法

import requests
import aiohttp
import asyncio
import aiofiles
from time import time
from lxml import etree
from selenium import webdriver
import os
from time import sleep

# Request headers shared by the requests call and the aiohttp session.
# The user-agent literal is split across lines only to keep the source narrow.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43"
    ),
}

async def _download_gallery(base_img_url, images_name, image_total):
    """Download images 1..image_total of one gallery concurrently."""
    file_names = [images_name + str(i) + ".jpg" for i in range(1, image_total + 1)]
    jobs = [
        saveImage(base_img_url + str(i) + ".jpg", name)
        for i, name in enumerate(file_names, start=1)
    ]
    # return_exceptions=True keeps the crawl best-effort: one failed image
    # does not cancel the rest, and the failure is reported instead of lost.
    results = await asyncio.gather(*jobs, return_exceptions=True)
    for name, result in zip(file_names, results):
        if isinstance(result, Exception):
            print(name, "下载失败:", repr(result))


def preprocessing(browser):
    """Download every gallery linked from the site's front page concurrently.

    Loads the front page in ``browser``, collects the gallery links with an
    XPath query, then for each gallery fetches its page with ``requests``,
    reads the image count, title and image base URL from the HTML, and
    downloads images ``1.jpg`` .. ``<count>.jpg`` concurrently through the
    ``saveImage`` coroutine. The elapsed time per gallery is printed.

    Args:
        browser: A Selenium WebDriver instance used to render the front page.

    Raises:
        IndexError: If a gallery page lacks one of the expected elements.
        requests.exceptions.RequestException: If a gallery page cannot be
            fetched (including a timeout).
    """
    url = 'https://m.tuiimg.com/'
    browser.get(url=url)
    # find_elements(by, value) exists in both Selenium 3 and 4, whereas the
    # find_elements_by_xpath shortcut was removed in Selenium 4.
    a_arr = browser.find_elements("xpath", '//*[@id="main"]/li/a')
    for image in a_arr:
        # Gallery page address, e.g. https://m.tuiimg.com/meinv/2195/
        item_href = image.get_attribute("href")
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url=item_href, headers=headers, timeout=10).content
        tree = etree.HTML(res)
        total_text = tree.xpath('//*[@id="allbtn"]/text()')[0]
        # Total number of images: the part after "/" with the ")" stripped.
        image_total = int(total_text.split('/')[1].replace(")", ""))
        # Gallery title
        images_name = tree.xpath('//*[@id="container"]/div[3]/h1/text()')[0]
        print(images_name)
        # Drop the file name of the currently shown image, keep the directory.
        base_img_url = "/".join(tree.xpath('//*[@id="nowimg"]/@src')[0].split("/")[0:-1]) + "/"
        print(base_img_url)
        start_time = time()
        # asyncio.run replaces get_event_loop() + asyncio.wait(coroutines):
        # passing bare coroutines to asyncio.wait was removed in Python 3.11.
        asyncio.run(_download_gallery(base_img_url, images_name, image_total))
        print(images_name, "图集下载花费的时间是" + str(time() - start_time), "秒")


async def saveImage(image_url, images_name):
    """Download one image asynchronously and store it in the gallery directory.

    Args:
        image_url: URL of the image to download.
        images_name: File name to write inside ``./妹子美图合集1/``.
    """
    print("准备下载", images_name)
    file_path = "./妹子美图合集1/"
    # exist_ok replaces the separate exists() check and cannot fail when the
    # directory is already there.
    os.makedirs(file_path, exist_ok=True)
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url=image_url) as response:
            if response.status != 200:
                # Do not store an error page under a .jpg name and report success.
                print(images_name, "下载失败,状态码", response.status)
                return
            data = await response.read()
    # The HTTP connection is released before the file is written.
    async with aiofiles.open(file_path + images_name, "wb") as afp:
        await afp.write(data)
    print(images_name, "保存成功")

if __name__ == '__main__':
    # Script entry point: start Chrome through the local chromedriver binary
    # and crawl every gallery on the front page.
    chrome = webdriver.Chrome(executable_path='chromedriver.exe')
    try:
        preprocessing(browser=chrome)
    finally:
        # Always close the browser and stop the chromedriver process,
        # even when the crawl raises.
        chrome.quit()

在这里插入图片描述

效率大概提升了5倍,还凑合…

把上面解析图集数量和图片基础地址的字符串切分换成正则表达式,还能再快一点(需要先 import re):


# Requires `import re` next to the script's other imports.
# Total image count: the digits that follow the "/" in the counter text,
# the same field the split('/')[1] version reads.
image_total = int(re.search(r'/\s*(\d+)', tree.xpath('//*[@id="allbtn"]/text()')[0]).group(1))

# Base URL: strip the final path segment (the image file name) and keep the trailing "/".
base_img_url = re.sub(r'[^/]*$', '', tree.xpath('//*[@id="nowimg"]/@src')[0])
  • 0
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值