Python thread pool / AIO (asynchronous non-blocking) HTTP request examples

Tests multi-threading, thread pool, AIO, and other modes side by side.

Threads are limited by CPU scheduling; with a large number of requests, AIO is the most efficient. See the code for details.

The comments explain each approach in detail; the `__main__` block is the program entry point.


"""
python 各种方式发起 http 请求对比
参考:
https://plainenglish.io/blog/send-http-requests-as-fast-as-possible-in-python-304134d46604
"""
import time

import requests
from requests.sessions import Session
from threading import Thread, local
from queue import Queue
from concurrent.futures import ThreadPoolExecutor


def sync_get(url_list: list):
    """
    Plain synchronous requests, one URL at a time.
    :param url_list:
    :return:
    """

    def download_link(url: str) -> None:
        result = requests.get(url).content
        print(f'Read {len(result)} from {url}')

    def download_all(urls: list) -> None:
        for url in urls:
            download_link(url)

    start = time.time()
    download_all(url_list)
    end = time.time()
    print(f'sync download {len(url_list)} links in {end - start} seconds')


def sync_get_share_session(url_list: list):
    """
    Still synchronous, but all requests share one Session.
    A shared Session keeps cookies (so login state persists)
    and reuses TCP connections, cutting TLS handshake time.
    For details, search for "python requests Session performance".
    A small cookie demo (session_cookie_demo) follows this function.
    TODO: expand on sessions and cookies as needed; they come up often in later security topics.
    :param url_list:
    :return:
    """
    def download_link(url: str, session: Session):
        with session.get(url) as response:
            result = response.content
            print(f'Read {len(result)} from {url}')

    def download_all(urls: list):
        with requests.Session() as session:
            for url in urls:
                download_link(url, session=session)

    start = time.time()
    download_all(url_list)
    end = time.time()
    print(f'download {len(url_list)} links in {end - start} seconds')
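
# Why the shared Session matters — a minimal sketch (httpbin.org is an
# illustrative endpoint, not part of the benchmark above): cookies set by one
# response are sent automatically on later requests from the same Session.
# Call it manually to try it out.
def session_cookie_demo():
    with requests.Session() as s:
        s.get('https://httpbin.org/cookies/set/token/abc123')
        resp = s.get('https://httpbin.org/cookies')
        print(resp.json())  # expected: {'cookies': {'token': 'abc123'}}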


def multi_thread_get(url_list):
    """
    Multi-threaded requests with 10 worker threads.
    Key points:
    1. Mutating a shared list across threads is unsafe; a thread-safe Queue holds the URLs instead (search "python queue thread safety").
    2. Sharing one Session across threads is also unsafe; thread-local storage gives each thread its own Session.
    An alternative worker that exits on its own is sketched below in download_link_polling.
    :param url_list:
    :return:
    """
    # Push all URLs into a thread-safe queue
    q = Queue(maxsize=0)
    for url in url_list:
        q.put(url)
    thread_local = local()  # The thread_local will hold a Session object

    def get_session() -> Session:
        if not hasattr(thread_local, 'session'):
            thread_local.session = requests.Session()  # Create a new Session if not exists
        return thread_local.session

    def download_link() -> None:
        '''Worker: pull URLs from the queue until none are left.'''
        session = get_session()
        while True:
            # With timeout=None this blocks forever once the queue is empty, so
            # the workers are started as daemon threads below; a finite timeout
            # would instead raise queue.Empty after that many seconds.
            url = q.get(block=True, timeout=None)
            with session.get(url) as response:
                print(f'Read {len(response.content)} from {url}')
            q.task_done()  # tell the queue, this url downloading work is done
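
    # Alternative worker (sketch): poll the queue with a finite timeout and
    # return once it stays empty, so threads exit cleanly without daemon=True.
    # The 2-second idle timeout is an assumed value, not tuned.
    def download_link_polling() -> None:
        from queue import Empty
        session = get_session()
        while True:
            try:
                url = q.get(block=True, timeout=2)  # raises queue.Empty after 2s idle
            except Empty:
                return  # queue drained; worker exits on its own
            with session.get(url) as response:
                print(f'Read {len(response.content)} from {url}')
            q.task_done()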

    def download_all() -> None:
        '''Start 10 threads, each thread as a wrapper of downloader'''
        thread_num = 10
        for i in range(thread_num):
            t_worker = Thread(target=download_link, daemon=True)  # daemon: don't keep the process alive after q.join()
            t_worker.start()
        q.join()  # main thread wait until all url finished downloading

    print("start work")
    start = time.time()
    download_all()
    end = time.time()
    print(f'download {len(url_list)} links in {end - start} seconds')


def thread_pool_get(url_list):
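    """
    Thread-pool variant: ThreadPoolExecutor manages the 10 workers, so no
    manual Queue or thread shutdown is needed. The thread-local Session
    trick from multi_thread_get still gives each worker its own Session.
    """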
    thread_local = local()

    def get_session() -> Session:
        if not hasattr(thread_local, 'session'):
            thread_local.session = requests.Session()
        return thread_local.session

    def download_link(url: str):
        session = get_session()
        with session.get(url) as response:
            print(f'Read {len(response.content)} from {url}')

    def download_all() -> None:
        with ThreadPoolExecutor(max_workers=10) as executor:
            executor.map(download_link, url_list)

    start = time.time()
    download_all()
    end = time.time()
    print(f'download {len(url_list)} links in {end - start} seconds')


def aio_get(url_list):
    """
    Send requests asynchronously (non-blocking I/O).
    Concurrency can go very high here, with throughput bounded by bandwidth,
    the NIC, and so on; too much concurrency may get rate-limited by the
    server, though (a semaphore-based cap is sketched below).
    :param url_list:
    :return:
    """
    import asyncio
    import aiohttp
    from aiohttp.client import ClientSession

    async def download_link(url: str, session: ClientSession):
        async with session.get(url) as response:
            result = await response.text()
            print(f'Read {len(result)} from {url}')
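
    # Sketch: besides the TCPConnector limit, an asyncio.Semaphore can cap the
    # number of in-flight requests when a server rate-limits aggressive
    # clients. The semaphore size is up to the caller; 20 would be a
    # reasonable starting point (an assumption, not a tuned value).
    async def download_link_limited(url: str, session: ClientSession,
                                    sem: asyncio.Semaphore):
        async with sem:
            await download_link(url, session)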

    async def download_all(urls: list):
        my_conn = aiohttp.TCPConnector(limit=40)
        async with aiohttp.ClientSession(connector=my_conn) as session:
            tasks = []
            for url in urls:
                task = asyncio.ensure_future(download_link(url=url, session=session))
                tasks.append(task)

            await asyncio.gather(*tasks, return_exceptions=True)  # the gather must be awaited inside the session context

    start = time.time()
    asyncio.run(download_all(url_list))
    end = time.time()

    print(f'download {len(url_list)} links in {end - start} seconds')


if __name__ == '__main__':
    _url_list = ["https://cn.bing.com/", "https://www.baidu.com/", "https://www.so.com/"] * 50
    # Method 1: plain synchronous requests
    # sync_get(_url_list)  # 58.78762245178223
    # Method 2: synchronous, sharing one Session
    # sync_get_share_session(_url_list)  # 32.01118564605713
    # Method 3: multi-threaded
    # multi_thread_get(_url_list)  # 3.5877575874328613
    # Method 4: thread pool
    thread_pool_get(_url_list)  # 3.5470234870910645
    # Method 5: asynchronous non-blocking requests
    aio_get(_url_list)  # 1.354555606842041
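
For the 150 URLs above (3 sites × 50), the timings recorded in the comments come out to roughly: sync 58.8 s, shared-Session sync 32.0 s, multi-threaded 3.59 s, thread pool 3.55 s, and AIO 1.35 s.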

[Figure: multi-threading diagram]

[Figure: thread pool diagram]

[Figure: AIO asynchronous non-blocking I/O diagram]
