python异步协程实战：wallhaven壁纸网站

海带烧鱼

已于 2022-04-28 08:45:50 修改

阅读量2.5k

点赞数 1

文章标签： python 安全

于 2022-04-26 00:30:32 首次发布

本文链接：https://blog.csdn.net/qq_56323420/article/details/124418093

版权

开发环境：Python 3.9、IntelliJ IDEA

相较于线程池版本，速度提升了约50%；在校园网100M带宽下，爬取一页（24张图片）大约需要60s

代码中默认是下载第一页的图片，想要多爬点的话加个循环就好了

url = "https://wallhaven.cc/toplist?page=1"

整套代码如下，仅供参考

import time
import aiohttp
import asyncio
import requests
import aiofiles
from lxml import etree


async def aiodownload(li):
    """Download the full-size image behind one thumbnail element.

    The detail-page link is read from ``./figure/a/@href`` of *li*, the
    detail page is fetched, the image URL is taken from the ``<img>`` inside
    the page's main section, and the image bytes are written to
    ``img/<file name>``.

    Args:
        li: An lxml element (one ``<li>`` of the listing page) exposing
            ``xpath``.

    Returns:
        None. Progress and failures are reported with ``print``.

    Note:
        Reads the module-level ``headers`` dict for the request headers.
    """
    import os  # local import so this block does not depend on a file-level change

    sub_url = "".join(li.xpath('./figure/a/@href'))
    if not sub_url:
        print("跳过：未找到详情页链接")
        return

    # aiofiles.open() does not create missing directories.
    os.makedirs("img", exist_ok=True)

    async with aiohttp.ClientSession() as session:
        # Fetch the detail page through aiohttp: a blocking requests.get()
        # here would stall the event loop and serialise all downloads.
        sub_text = ""
        for attempt in range(3):
            if attempt:
                await asyncio.sleep(3)  # back off before retrying
            async with session.get(sub_url, headers=headers) as sub_resp:
                sub_text = await sub_resp.text()
                # A very short body is treated as a rate-limit page, keeping
                # the original heuristic alongside the explicit 429 check.
                throttled = sub_resp.status == 429 or len(sub_text) < 1000
            if not throttled:
                break
        else:
            print("请求过于频繁，跳过", sub_url)
            return

        sub_html = etree.HTML(sub_text)
        if sub_html is None:
            print("详情页解析失败，跳过", sub_url)
            return
        # Download link of the full-size image.
        img_url = "".join(sub_html.xpath('/html/body/main/section/div[1]/img/@src'))
        if not img_url:
            print("未找到图片地址，跳过", sub_url)
            return

        img_name = img_url.split('/')[-1]
        async with session.get(img_url, headers=headers) as img_resp:
            if img_resp.status != 200:
                # Do not save an error page under an image file name.
                print("下载失败", img_name, img_resp.status)
                return
            data = await img_resp.content.read()

        async with aiofiles.open("img/" + img_name, mode="wb") as f:
            await f.write(data)
        print("下载完成", img_name)
        await asyncio.sleep(0)  # yield control to the event loop once


async def main():
    """Fetch page 1 of the toplist and download every listed image concurrently.

    One ``aiodownload`` task is created per ``<li>`` found on the listing
    page. Failures of individual downloads are printed instead of being
    silently dropped.

    Note:
        Reads the module-level ``headers`` dict for the request headers.
    """
    url = "https://wallhaven.cc/toplist?page=1"
    resp = requests.get(url, headers=headers)
    html = etree.HTML(resp.text)
    # Collect the thumbnail <li> elements; guard against an unparsable body.
    lis = html.xpath('/html/body/main/div[1]/section[1]/ul/li') if html is not None else []
    if not lis:
        # asyncio.wait() raises ValueError on an empty collection.
        print("未找到任何图片")
        return

    tasks = [asyncio.create_task(aiodownload(li)) for li in lis]
    # gather(..., return_exceptions=True) retrieves every task's outcome, so
    # one failed download neither aborts the rest nor goes unnoticed.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for result in results:
        if isinstance(result, BaseException):
            print("下载出错", repr(result))


if __name__ == '__main__':
    # Defined at module level on purpose: main() and aiodownload() read
    # `headers` as a global.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44"
    }
    # perf_counter() is monotonic, so it suits measuring an elapsed interval.
    t1 = time.perf_counter()
    print("开始下载...")
    # asyncio.run() creates and closes the event loop itself; calling
    # asyncio.get_event_loop() without a running loop is deprecated.
    asyncio.run(main())
    print("下载完成...")
    t2 = time.perf_counter()
    print('运行时间:', t2 - t1)

# requests.get() --> 异步操作
# 实战：wallhaven壁纸网站
import time
import aiohttp
import asyncio
import requests
import aiofiles
from lxml import etree


# Optimised coroutine version: skips fetching the detail page and builds the
# image download URL by string concatenation instead. The author reports going
# from about 100 s single-threaded to about 20 s with coroutines (about 5x).
async def aiodownload(li, ft):
    """Download one full-size image whose URL is derived from its detail link.

    The last path segment of ``./figure/a/@href`` is used as the image id;
    the download URL is
    ``https://w.wallhaven.cc/full/<first two chars of id>/wallhaven-<id><ext>``.
    The extension *ft* is tried first; if the server does not answer 200 the
    other known extension is tried before giving up.

    Args:
        li: An lxml element (one ``<li>`` of the listing page) exposing
            ``xpath``.
        ft: File extension guessed by the caller, e.g. ``'.jpg'`` or
            ``'.png'``.

    Returns:
        None. Progress and failures are reported with ``print``.

    Note:
        Reads the module-level ``headers`` dict for the request headers.
    """
    import os  # local import so this block does not depend on a file-level change

    sub_url = "".join(li.xpath('./figure/a/@href'))
    image_id = sub_url.split('/')[-1]
    if not image_id:
        print("跳过：未找到详情页链接")
        return

    # aiofiles.open() does not create missing directories.
    os.makedirs("img", exist_ok=True)

    url_prefix = 'https://w.wallhaven.cc/full/' + image_id[:2] + '/wallhaven-' + image_id
    # The caller's extension is only a guess, so fall back to the other one.
    candidates = [ft] + [ext for ext in ('.jpg', '.png') if ext != ft]

    async with aiohttp.ClientSession() as session:
        for ext in candidates:
            img_url = url_prefix + ext  # download link of the image
            img_name = img_url.split('/')[-1]
            async with session.get(img_url, headers=headers) as img_resp:
                if img_resp.status != 200:
                    # Do not save an error page under an image file name.
                    continue
                data = await img_resp.content.read()
            async with aiofiles.open("img/" + img_name, mode="wb") as f:
                await f.write(data)
            print("下载完成", img_name)
            break
        else:
            print("下载失败", 'wallhaven-' + image_id)
        # NOTE(review): kept from the original; purpose of this 3 s pause
        # after the download is unclear — confirm whether it is still needed.
        await asyncio.sleep(3)


async def main():
    """Ask how many toplist pages to fetch and download all listed images.

    Pages ``1..page`` are fetched one after another; for every ``<li>`` on a
    page an ``aiodownload`` task is created, and all tasks are awaited
    together at the end. Failures of individual downloads are printed instead
    of being silently dropped.

    Raises:
        ValueError: If the value typed at the prompt is not an integer.

    Note:
        Reads the module-level ``headers`` dict for the request headers.
    """
    tasks = []
    page = int(input("请输入要下载的页数，从第一页开始:"))
    for i in range(1, page + 1):
        print(f"正在下载第{i}页")
        url = f"https://wallhaven.cc/toplist?page={i}"
        resp = requests.get(url, headers=headers)
        html = etree.HTML(resp.text)
        # Collect the thumbnail <li> elements; guard against an unparsable body.
        lis = html.xpath('/html/body/main/div[1]/section[1]/ul/li') if html is not None else []
        for li in lis:
            # A match for this span selects '.png', otherwise '.jpg'.
            # NOTE(review): presumably the span is the PNG badge on the
            # thumbnail — confirm against the page markup.
            ft = '.png' if li.xpath('./figure/div/span[2]/span') else '.jpg'
            tasks.append(asyncio.create_task(aiodownload(li, ft)))

    if not tasks:
        # asyncio.wait() raises ValueError on an empty collection.
        print("未找到任何图片")
        return

    # gather(..., return_exceptions=True) retrieves every task's outcome, so
    # one failed download neither aborts the rest nor goes unnoticed.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for result in results:
        if isinstance(result, BaseException):
            print("下载出错", repr(result))


if __name__ == '__main__':
    # Defined at module level on purpose: main() and aiodownload() read
    # `headers` as a global.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44"
    }
    # perf_counter() is monotonic, so it suits measuring an elapsed interval.
    # The measured interval includes the time spent at the input() prompt
    # inside main().
    t1 = time.perf_counter()

    # asyncio.run() creates and closes the event loop itself; calling
    # asyncio.get_event_loop() without a running loop is deprecated.
    asyncio.run(main())

    print("下载完成...")
    t2 = time.perf_counter()
    print('运行时间:', t2 - t1)