aiohttp异步爬虫

world_in_world

已于 2024-04-06 16:55:50 修改

阅读量489

点赞数

分类专栏：python爬虫
文章标签：python 爬虫

于 2023-04-08 21:52:42 首次发布

本文链接：https://blog.csdn.net/world_in_world/article/details/130035283

版权

python爬虫专栏收录该内容

18 篇文章 1 订阅

订阅专栏

aiohttp 模块官方文档：Welcome to AIOHTTP — aiohttp 3.9.3 documentation（https://docs.aiohttp.org/）

一、aiohttp 代替 requests 来执行异步的网络请求操作


import os
import time
import aiohttp
import asyncio


async def func(client, i):
    """Request the Baidu home page once and print the HTTP status.

    Args:
        client: The open ``aiohttp.ClientSession`` used to issue the request.
        i: Zero-based index of this request; it is printed as ``i + 1``.

    Returns:
        The response object for the request. Its connection has already been
        released when it is returned, so callers should only read metadata
        (``status``, ``headers``, ``repr``) from it, not the body.
    """
    # ``async with`` releases the connection back to the session's pool as
    # soon as we are done, instead of holding it until the session closes.
    async with client.get('https://www.baidu.com') as response:
        print(f'第{i+1}次请求，status_code={response.status}')
    return response


async def main():
    """Issue three requests concurrently over one shared session and print each result."""
    # The async context manager allocates the session's resources and
    # releases them automatically when the block exits.
    async with aiohttp.ClientSession() as client:
        task_list = [asyncio.create_task(func(client, i)) for i in range(3)]
        # ``asyncio.wait`` returns two sets, so the iteration order below is
        # not tied to the order in which the tasks were created.
        finished, _unfinished = await asyncio.wait(task_list)
        for finished_task in finished:
            print(f'{finished_task.result()}')


if __name__ == '__main__':
    # Script entry point: run the top-level coroutine on a fresh event loop.
    entry_coro = main()
    asyncio.run(entry_coro)

二、下面的代码虽然用到了异步相关的函数，但每个请求都要等上一个请求完成后才会发出，逻辑上仍是串行执行，效果相当于单线程


import os
import time
import aiohttp
import asyncio


class WangZhe:
    """Download hero skin images one after another over a single aiohttp session.

    Every request is awaited before the next one is issued, so although the
    code is written with coroutines the downloads do not overlap.
    """

    def __init__(self):
        # JSON endpoint listing the heroes (each entry carries 'ename' and 'cname').
        self.herolist_url = 'https://pvp.qq.com/web201605/js/herolist.json'
        # Image URL template; filled with (ename, ename, skin index).
        self.skin_url = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'
        # NOTE(review): the user-agent value is empty here — presumably
        # redacted by the author; fill in a real value before running.
        self.headers = {
            'user-agent': ''
        }

    async def skin_download(self, session):
        """Save up to three skin images for each of the first four heroes.

        Args:
            session: The open ``aiohttp.ClientSession`` used for all requests.

        For each hero, skin indices 1..3 are tried in order and the loop
        moves on to the next hero at the first non-200 response.
        """
        # Awaiting the coroutine directly is equivalent to wrapping it in a
        # task and awaiting that task immediately: nothing else runs meanwhile.
        new_result = await self.herolist(session)
        for d in new_result[:4]:
            for i in range(1, 4):
                url = self.skin_url.format(d['ename'], d['ename'], i)
                # ``async with`` releases the connection once the body is read
                # (or as soon as we break out on a bad status).
                async with session.get(url, headers=self.headers) as skin_response:
                    if skin_response.status != 200:
                        break
                    content = await skin_response.read()
                with open('../sublimetext_result/王者荣耀英雄皮肤/' + d['cname'] + '_' + str(i) + '.jpg', 'wb') as f:
                    f.write(content)
                print(f"{d['cname']}第{i}张皮肤下载成功")

    async def herolist(self, session):
        """Fetch the hero list and reduce each entry to its 'ename' and 'cname'.

        Args:
            session: The open ``aiohttp.ClientSession`` used for the request.

        Returns:
            A list of ``{'ename': ..., 'cname': ...}`` dicts, one per hero, in
            the order returned by the endpoint.
        """
        async with session.get(self.herolist_url, headers=self.headers) as herolist_response:
            # ``json()`` is a coroutine and must be awaited. ``content_type=None``
            # turns off aiohttp's Content-Type validation for this response.
            result = await herolist_response.json(content_type=None)
        return [{'ename': j['ename'], 'cname': j['cname']} for j in result]

    # ``async with`` may only be used inside an ``async def``.
    async def main(self):
        """Open one client session and run the whole download with it."""
        async with aiohttp.ClientSession() as session:
            # The session must be passed down so every request reuses it.
            await self.skin_download(session)


if __name__ == '__main__':
    # makedirs(exist_ok=True) also creates the missing parent directory and
    # avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs('../sublimetext_result/王者荣耀英雄皮肤', exist_ok=True)
    w = WangZhe()
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    asyncio.run(w.main())
    end = time.perf_counter()
    print(f'总耗时：{end - start}')  # the author measured roughly 14 s

三、对上方问题的改进：为每个英雄创建一个独立的任务并发下载。另外注意，对于返回 coroutine 的操作，调用时前面必须加 await；可以查阅官方文档确认对应操作的返回值类型，再决定是否需要加 await


'''
王者荣耀皮肤
'''
import os
import time
import aiohttp
import asyncio


class WangZhe:
    """Download hero skin images concurrently, one coroutine per hero."""

    def __init__(self):
        # Image URL template; filled with (ename, ename, skin index).
        self.skin_url = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'
        # JSON endpoint listing the heroes (each entry carries 'ename' and 'cname').
        self.herolist_url = 'https://pvp.qq.com/web201605/js/herolist.json'
        # NOTE(review): the user-agent value is empty here — presumably
        # redacted by the author; fill in a real value before running.
        self.headers = {
            'user-agent': '',
        }

    async def skin_download(self, session, ename, cname):
        """Save up to three skin images for a single hero.

        Args:
            session: The open ``aiohttp.ClientSession`` used for the requests.
            ename: Hero identifier substituted into the image URL.
            cname: Hero display name, used as the output file name prefix.

        Skin indices 1..3 are tried in order; the loop stops at the first
        non-200 response.
        """
        for i in range(1, 4):
            url = self.skin_url.format(ename, ename, i)
            # ``async with`` releases the connection once the body is read
            # (or as soon as we break out on a bad status).
            async with session.get(url, headers=self.headers) as response:
                if response.status != 200:  # aiohttp exposes the status code as ``status``
                    break
                content = await response.read()  # aiohttp returns the raw bytes via ``read()``
            with open("../sublimetext_result/王者荣耀英雄皮肤2/" + cname + "_" + str(i) + '.jpg', 'wb') as f:
                f.write(content)
            print('{}第{}张皮肤下载成功'.format(cname, str(i)))

    async def main(self):
        """Fetch the hero list and download the first four heroes' skins concurrently.

        Failures of individual heroes are reported on stdout and do not stop
        the other downloads.
        """
        async with aiohttp.ClientSession() as session:
            async with session.get(self.herolist_url, headers=self.headers) as response:
                # ``content_type=None`` turns off aiohttp's Content-Type validation.
                result = await response.json(content_type=None)
            heroes = result[:4]
            # gather() accepts an empty argument list (asyncio.wait raises on
            # one) and, with return_exceptions=True, hands back every failure
            # instead of leaving it unretrieved inside a finished task.
            outcomes = await asyncio.gather(
                *(self.skin_download(session, hero['ename'], hero['cname']) for hero in heroes),
                return_exceptions=True,
            )
            for hero, outcome in zip(heroes, outcomes):
                if isinstance(outcome, BaseException):
                    print('{}皮肤下载失败：{!r}'.format(hero['cname'], outcome))


if __name__ == '__main__':
    # makedirs(exist_ok=True) also creates the missing parent directory and
    # avoids the check-then-create race of exists() followed by mkdir().
    os.makedirs('../sublimetext_result/王者荣耀英雄皮肤2', exist_ok=True)
    w = WangZhe()
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    # asyncio.run creates and closes the event loop itself; calling
    # get_event_loop() with no running loop is deprecated.
    asyncio.run(w.main())
    end = time.perf_counter()
    print('总耗时：{}'.format(end - start))


'''
英雄联盟皮肤
'''
import aiohttp
import asyncio
import os
import random


class YingXiongLianMeng:
    """Download the main skin images of every League of Legends hero concurrently."""

    def __init__(self):
        # JSON endpoint whose 'hero' list carries each hero's 'heroId' and 'name'.
        self.herolist_url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js?ts=2803015'
        # Per-hero detail endpoint; filled with the hero id.
        self.skin_url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js?ts=2803019'
        # NOTE(review): the user-agent value is empty here — presumably
        # redacted by the author; fill in a real value before running.
        self.headers = {
            'user-agent': ''
        }

    async def skin_down(self, session, heroId, name):
        """Save every skin of one hero that has a non-empty 'mainImg' URL.

        Args:
            session: The open ``aiohttp.ClientSession`` used for the requests.
            heroId: Hero identifier substituted into the detail URL.
            name: Hero name; selects the output directory ``英雄联盟_{name}``.

        Files are numbered 1, 2, ... in the order the usable skins appear.
        """
        async with session.get(self.skin_url.format(heroId)) as response:
            # ``content_type=None`` turns off aiohttp's Content-Type validation.
            dic = await response.json(content_type=None)
        num = 0
        for skin in dic['skins']:
            img_url = skin['mainImg']
            if not img_url:
                # Entries without a main image are skipped and not counted.
                continue
            num += 1
            # Short random pause (0.3 s or 0.4 s) before each image request.
            await asyncio.sleep(random.randint(3, 4)/10)
            # ``async with`` releases the connection once the body is read.
            async with session.get(img_url) as res:
                content = await res.read()
            with open(f'../图灵教育/测试结果/英雄联盟_{name}/{num}.jpg', 'wb') as f:
                f.write(content)
            print(f'英雄联盟_{name}/{num}.jpg……下载成功')

    async def main(self):
        """Fetch the hero list, create one directory per hero and download all skins.

        Failures of individual heroes are reported on stdout and do not stop
        the other downloads.
        """
        async with aiohttp.ClientSession(headers=self.headers) as session:
            async with session.get(self.herolist_url) as response:
                dic = await response.json(content_type=None)
            heroes = dic['hero']
            jobs = []
            for hero in heroes:
                heroId = hero['heroId']
                name = hero['name']
                # Creates missing parent directories too and tolerates an
                # already existing target directory.
                os.makedirs(f'../图灵教育/测试结果/英雄联盟_{name}', exist_ok=True)
                jobs.append(self.skin_down(session, heroId, name))
            # gather() accepts an empty argument list (asyncio.wait raises on
            # one) and, with return_exceptions=True, hands back every failure
            # instead of leaving it unretrieved inside a finished task.
            outcomes = await asyncio.gather(*jobs, return_exceptions=True)
            for hero, outcome in zip(heroes, outcomes):
                if isinstance(outcome, BaseException):
                    print(f"英雄联盟_{hero['name']}……下载失败：{outcome!r}")


if __name__ == '__main__':
    # Script entry point: build the downloader, then drive its top-level
    # coroutine to completion on a fresh event loop.
    yxlm = YingXiongLianMeng()
    entry_coro = yxlm.main()
    asyncio.run(entry_coro)