async for 在爬虫中的使用例子

import asyncio

import re

import typing

from concurrent.futures import Executor, ThreadPoolExecutor

from urllib.request import urlopen

# Shared thread pool used by default to run blocking calls off the event loop.
DEFAULT_EXECUTOR = ThreadPoolExecutor(4)
# Captures the href value of an anchor tag in raw (bytes) HTML.
# The quote class is ["'] only: inside [...] a "|" is a literal pipe, not
# alternation, so the former ["|'] accepted "|" as a delimiter and truncated
# any href containing a pipe (href="a|b" was captured as "a").
ANCHOR_TAG_PATTERN = re.compile(
    rb"""<a.+?href=["'](.*?)["'].*?>""",
    re.RegexFlag.MULTILINE | re.RegexFlag.IGNORECASE,
)


async def wrap_async(generator: typing.Generator,
                     executor: Executor = DEFAULT_EXECUTOR,
                     sentinel=None,
                     *,
                     loop: typing.Optional[asyncio.AbstractEventLoop] = None):
    """
    Wrap a blocking generator and expose it as an asynchronous generator.

    Each item is obtained by running ``next(generator, sentinel)`` in
    ``executor``, so the event loop is not blocked while the item is produced.

    :param generator: the synchronous generator to drain
    :param executor: executor in which the blocking ``next`` calls are run
    :param sentinel: object returned by ``next`` once ``generator`` is
        exhausted; iteration stops when this exact object is seen, so the
        generator must not yield it (with the default, it must not yield
        ``None``)
    :param loop: event loop used to schedule the calls; defaults to the
        currently running loop
    :return: an asynchronous generator yielding the items of ``generator``
    """

    if loop is None:
        loop = asyncio.get_running_loop()

    while True:
        # Equivalent to next(generator, sentinel), executed in the executor.
        # Passing a default makes exhaustion show up as a return value
        # instead of a StopIteration raised inside the worker.
        result = await loop.run_in_executor(executor, next, generator, sentinel)
        # Compare by identity: an item that merely compares equal to the
        # sentinel (or defines an unusual __eq__) must not end the iteration.
        if result is sentinel:
            break
        yield result


def follow(*links):
    """
    Lazily download each of the given URLs.

    Nothing is fetched until the returned generator is advanced; each step
    opens one URL, reads its whole body, and closes the response.

    :param links: URLs to fetch, in the order given
    :return: a generator yielding ``(link, body)`` pairs, where ``body`` is
        the result of calling ``read()`` on the opened response
    """

    for link in links:
        # Use the response as a context manager so the underlying connection
        # is released as soon as the body has been read, rather than leaked.
        with urlopen(link) as response:
            body = response.read()
        yield link, body


def get_links(text: bytes) -> typing.Iterator[bytes]:
    """
    Lazily extract the href value of every anchor tag found in ``text``.

    :param text: raw HTML as bytes; ``ANCHOR_TAG_PATTERN`` is a bytes
        pattern, so a ``str`` argument raises ``TypeError``
    :return: a generator yielding each captured href as bytes, in the order
        the anchors appear in ``text``
    """

    # finditer only ever yields match objects, and the pattern has exactly one
    # capture group, so no guard is needed and group(1) is always the href.
    return (match.group(1) for match in ANCHOR_TAG_PATTERN.finditer(text))


async def main(*links):
    """
    Crawl the given URLs and print every anchor href found on each page.

    For each URL, prints the URL and its raw body, then prints each link
    extracted from that body on its own line.

    :param links: URLs to fetch
    :return: None
    """
    pages = wrap_async(follow(*links))
    async for url, content in pages:
        print("Current url:", url)
        print("Content:", content)
        hrefs = wrap_async(get_links(content))
        async for href in hrefs:
            print(href)


if __name__ == "__main__":
    # Run the example crawl only when executed as a script, so that importing
    # this module does not trigger network requests.
    asyncio.run(main("https://www.cnblogs.com/c-x-a"))

转载于:https://www.cnblogs.com/c-x-a/p/11028456.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值