Python coroutines (coroutine)

Let's start with a web crawler. Here is a synchronous version: each page takes as long as its simulated sleep, so the four pages take 1 + 2 + 3 + 4 = 10 seconds in total.

import time

def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    time.sleep(sleep_time)
    print('OK {}'.format(url))

def main(urls):
    for url in urls:
        crawl_page(url)

%time main(['url_1', 'url_2', 'url_3', 'url_4'])
crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
Wall time: 10.1 s
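
The %time magic only works in IPython/Jupyter. In a plain script you can get roughly the same measurement with time.perf_counter, for example:

import time

start = time.perf_counter()
main(['url_1', 'url_2', 'url_3', 'url_4'])
print('Wall time: {:.2f} s'.format(time.perf_counter() - start))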

The nest_asyncio module lets you call asyncio.run() in an environment that already has a running event loop, such as Jupyter Notebook.

!pip install nest_asyncio
Collecting nest_asyncio
  Downloading https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl
Installing collected packages: nest-asyncio
Successfully installed nest-asyncio-1.6.0
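
As an aside, recent IPython/Jupyter versions also support awaiting a coroutine directly at the top level of a cell (autoawait), so nest_asyncio is not strictly required there. A minimal sketch, assuming it runs inside a notebook cell:

import asyncio

async def hello():
    await asyncio.sleep(1)
    return 'done'

# top-level await only works in an IPython/Jupyter cell;
# in a plain .py script you would call asyncio.run(hello()) instead
result = await hello()
print(result)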

With that, you can roughly see how coroutines are used. Try running the code below. Wait, why does it still take 10 seconds?

import asyncio
import nest_asyncio

# allow asyncio.run() to work inside Jupyter Notebook's already-running event loop
nest_asyncio.apply()

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    for url in urls:
        await crawl_page(url)

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))
crawling url_1
OK url_1
crawling url_2
OK url_2
crawling url_3
OK url_3
crawling url_4
OK url_4
Wall time: 10.1 s

It really is still 10 seconds, and that is expected: await here is a synchronous call, so main() does not move on to the next crawl_page() until the current one has finished. To get real concurrency we need an important concept in coroutines: the Task.

import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]
    for task in tasks:
        await task

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
Wall time: 4.02 s
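
asyncio.create_task() was added in Python 3.7. On older versions the usual equivalent is asyncio.ensure_future() together with an explicit event loop; a rough sketch:

import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    await asyncio.sleep(int(url.split('_')[-1]))
    print('OK {}'.format(url))

async def main(urls):
    # ensure_future wraps each coroutine in a Task, just like create_task
    tasks = [asyncio.ensure_future(crawl_page(url)) for url in urls]
    for task in tasks:
        await task

loop = asyncio.get_event_loop()
loop.run_until_complete(main(['url_1', 'url_2', 'url_3', 'url_4']))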

All four pages now run concurrently, so the total time is roughly the longest single sleep (about 4 seconds) instead of the sum (10 seconds). There is another way to write this, using asyncio.gather:

import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    sleep_time = int(url.split('_')[-1])
    await asyncio.sleep(sleep_time)
    print('OK {}'.format(url))

async def main(urls):
    tasks = [asyncio.create_task(crawl_page(url)) for url in urls]
    # unpack the tasks list with * so gather receives each task as a separate argument
    await asyncio.gather(*tasks)

%time asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))

crawling url_1
crawling url_2
crawling url_3
crawling url_4
OK url_1
OK url_2
OK url_3
OK url_4
Wall time: 4.03 s
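
asyncio.gather also accepts coroutine objects directly and wraps them into tasks itself, so the explicit create_task step can be skipped. A minimal sketch reusing the same crawl_page:

import asyncio

async def crawl_page(url):
    print('crawling {}'.format(url))
    await asyncio.sleep(int(url.split('_')[-1]))
    print('OK {}'.format(url))

async def main(urls):
    # gather schedules each coroutine as a task and waits for all of them
    await asyncio.gather(*(crawl_page(url) for url in urls))

asyncio.run(main(['url_1', 'url_2', 'url_3', 'url_4']))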

Demystifying coroutines

import asyncio

async def worker_1():
    print('worker_1 start')
    await asyncio.sleep(1)
    print('worker_1 done')

async def worker_2():
    print('worker_2 start')
    await asyncio.sleep(2)
    print('worker_2 done')

async def main():
    print('before await')
    await worker_1()
    print('awaited worker_1')
    await worker_2()
    print('awaited worker_2')

%time asyncio.run(main())
before await
worker_1 start
worker_1 done
awaited worker_1
worker_2 start
worker_2 done
awaited worker_2
Wall time: 3.05 s

The version above awaits worker_1() and worker_2() one after another, so it takes about 3 seconds in total. The version below (shown after this list) turns them into tasks first; step by step, this is what happens:
  • asyncio.run(main()) executes: the program enters main(), the event loop starts, task1 and task2 are created and scheduled to run, and then 'before await' is printed.
  • await task1 executes: the scheduler runs worker_1, which prints 'worker_1 start' and yields control at await asyncio.sleep(1); the scheduler then runs worker_2, which prints 'worker_2 start' and yields at await asyncio.sleep(2). All of this takes almost no time, after which the event loop simply waits.
  • One second later worker_1's sleep finishes, the scheduler hands control back to task1, which prints 'worker_1 done', completes, and leaves the event loop; await task1 returns, the main task prints 'awaited worker_1' and keeps waiting at await task2.
  • Two seconds in, worker_2's sleep finishes, the scheduler hands control back to task2, which prints 'worker_2 done' and completes; the main task prints 'awaited worker_2', every coroutine is done, and the event loop ends.

import asyncio

async def worker_1():
    print('worker_1 start')
    await asyncio.sleep(1)
    print('worker_1 done')

async def worker_2():
    print('worker_2 start')
    await asyncio.sleep(2)
    print('worker_2 done')

async def main():
    task1 = asyncio.create_task(worker_1())
    task2 = asyncio.create_task(worker_2())
    print('before await')
    await task1
    print('awaited worker_1')
    await task2
    print('awaited worker_2')

%time asyncio.run(main())

before await
worker_1 start
worker_2 start
worker_1 done
awaited worker_1
worker_2 done
awaited worker_2
Wall time: 2.03 s

This version takes about 2 seconds, the length of the longest sleep, because both workers run while main() is waiting. What if we want to limit how long certain coroutine tasks may run and cancel them once they exceed the limit?

In the example below, worker_1 runs normally, worker_2 raises an error while running, and worker_3 takes too long so we cancel it; all of this information shows up in the final return value res.

import asyncio

async def worker_1():
    await asyncio.sleep(1)
    return 1

async def worker_2():
    await asyncio.sleep(2)
    return 2 / 0

async def worker_3():
    await asyncio.sleep(3)
    return 3

async def main():
    task_1 = asyncio.create_task(worker_1())
    task_2 = asyncio.create_task(worker_2())
    task_3 = asyncio.create_task(worker_3())

    await asyncio.sleep(2)
    task_3.cancel()

    res = await asyncio.gather(task_1, task_2, task_3, return_exceptions=True)
    print(res)

%time asyncio.run(main())

[1, ZeroDivisionError('division by zero'), CancelledError()]
Wall time: 2.01 s
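
If you only want a per-task time limit rather than sleeping and calling cancel() by hand, asyncio.wait_for does the cancellation for you when the timeout expires. A minimal sketch:

import asyncio

async def worker_3():
    await asyncio.sleep(3)
    return 3

async def main():
    try:
        # wait_for cancels worker_3 and raises asyncio.TimeoutError after 2 seconds
        res = await asyncio.wait_for(worker_3(), timeout=2)
        print(res)
    except asyncio.TimeoutError:
        print('worker_3 timed out and was cancelled')

asyncio.run(main())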

A producer-consumer model with coroutines

import asyncio
import random

async def consumer(queue, id):
    while True:
        val = await queue.get()
        print('{} get a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def producer(queue, id):
    for i in range(5):
        val = random.randint(1, 10)
        await queue.put(val)
        print('{} put a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def main():
    queue = asyncio.Queue()

    consumer_1 = asyncio.create_task(consumer(queue, 'consumer_1'))
    consumer_2 = asyncio.create_task(consumer(queue, 'consumer_2'))

    producer_1 = asyncio.create_task(producer(queue, 'producer_1'))
    producer_2 = asyncio.create_task(producer(queue, 'producer_2'))

    await asyncio.sleep(10)
    consumer_1.cancel()
    consumer_2.cancel()
    
    await asyncio.gather(consumer_1, consumer_2, producer_1, producer_2, return_exceptions=True)

%time asyncio.run(main())

producer_1 put a val: 8
producer_2 put a val: 10
consumer_1 get a val: 8
consumer_2 get a val: 10
producer_1 put a val: 5
producer_2 put a val: 1
consumer_2 get a val: 5
consumer_1 get a val: 1
producer_1 put a val: 6
producer_2 put a val: 6
consumer_1 get a val: 6
consumer_2 get a val: 6
producer_1 put a val: 10
producer_2 put a val: 6
consumer_2 get a val: 10
consumer_1 get a val: 6
producer_1 put a val: 9
producer_2 put a val: 9
consumer_1 get a val: 9
consumer_2 get a val: 9
Wall time: 10 s
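
Instead of sleeping for a fixed 10 seconds and then cancelling the consumers, you can use the queue's join()/task_done() pair to wait until every produced item has actually been processed. A sketch of that variant:

import asyncio
import random

async def consumer(queue, id):
    while True:
        val = await queue.get()
        print('{} get a val: {}'.format(id, val))
        await asyncio.sleep(1)
        queue.task_done()  # mark this item as processed

async def producer(queue, id):
    for i in range(5):
        val = random.randint(1, 10)
        await queue.put(val)
        print('{} put a val: {}'.format(id, val))
        await asyncio.sleep(1)

async def main():
    queue = asyncio.Queue()
    consumers = [asyncio.create_task(consumer(queue, 'consumer_{}'.format(i))) for i in (1, 2)]
    # wait for both producers to finish putting items
    await asyncio.gather(producer(queue, 'producer_1'), producer(queue, 'producer_2'))
    await queue.join()  # wait until every item has been marked task_done()
    for c in consumers:
        c.cancel()  # the consumers loop forever, so cancel them at the end

asyncio.run(main())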

Crawler: Douban's upcoming-movie page

!pip install aiohttp
Requirement already satisfied: aiohttp in d:\sdk\anaconda3\lib\site-packages (3.8.6)
Requirement already satisfied: asynctest==0.13.0; python_version < "3.8" in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (0.13.0)
Requirement already satisfied: typing-extensions>=3.7.4; python_version < "3.8" in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (3.7.4.3)
Requirement already satisfied: multidict<7.0,>=4.5 in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (6.0.5)
Requirement already satisfied: charset-normalizer<4.0,>=2.0 in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (3.3.2)
Requirement already satisfied: aiosignal>=1.1.2 in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (1.3.1)
Requirement already satisfied: yarl<2.0,>=1.0 in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (1.9.4)
Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (4.0.3)
Requirement already satisfied: attrs>=17.3.0 in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (19.1.0)
Requirement already satisfied: frozenlist>=1.1.1 in d:\sdk\anaconda3\lib\site-packages (from aiohttp) (1.3.3)
Requirement already satisfied: idna>=2.0 in d:\sdk\anaconda3\lib\site-packages (from yarl<2.0,>=1.0->aiohttp) (2.8)
import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def fetch_content(url):
    async with aiohttp.ClientSession(headers=None, connector=aiohttp.TCPConnector(ssl=False)) as session:
        async with session.get(url) as response:
            return await response.text()

async def main():
    url = "https://movie.douban.com/cinema/later/beijing/"
    init_page = await fetch_content(url)
    init_soup = BeautifulSoup(init_page, 'html.parser')

    movie_names, urls_to_fetch, movie_dates = [], [], []

    all_movies = init_soup.find('div', id="showing-soon-list")
    if all_movies is None:
        return
    for each_movie in all_movies.find_all('div', class_="item"):
        all_a_tag = each_movie.find_all('a')
        all_li_tag = each_movie.find_all('li')

        movie_names.append(all_a_tag[1].text)
        urls_to_fetch.append(all_a_tag[1]['href'])
        movie_dates.append(all_li_tag[0].text)

    tasks = [fetch_content(url) for url in urls_to_fetch]
    pages = await asyncio.gather(*tasks)

    for movie_name, movie_date, page in zip(movie_names, movie_dates, pages):
        soup_item = BeautifulSoup(page, 'html.parser')
        img_tag = soup_item.find('img')
        if img_tag is not None:
            print('{} {} {}'.format(movie_name, movie_date, img_tag['src']))

%time asyncio.run(main())
Wall time: 1.38 s
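
No movies were printed above, most likely because Douban rejects requests that lack a browser-like User-Agent. A possible tweak is to pass a User-Agent header to the session (the header value here is just an example):

import aiohttp

HEADERS = {
    # any reasonably recent browser User-Agent string should do
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
}

async def fetch_content(url):
    async with aiohttp.ClientSession(headers=HEADERS,
                                     connector=aiohttp.TCPConnector(ssl=False)) as session:
        async with session.get(url) as response:
            return await response.text()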