Implementing an Asynchronous Web Crawler with the Tornado Framework
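
This example uses Tornado's AsyncHTTPClient together with a queues.Queue and a fixed pool of worker coroutines to crawl every page under the Tornado documentation site concurrently; BeautifulSoup extracts the links from each fetched page.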

from urllib.parse import urljoin

from bs4 import BeautifulSoup
from tornado import gen, httpclient, ioloop, queues

base_url = "http://www.tornadoweb.org/en/stable/"
concurrency = 20  # number of concurrent worker coroutines

async def get_links(url):
    http_client = httpclient.AsyncHTTPClient()
    response = await http_client.fetch(url)
    html = response.body.decode("utf8")
    # name the parser explicitly; a bare BeautifulSoup(html) call emits a
    # GuessedAtParserWarning and may pick a different parser per machine
    soup = BeautifulSoup(html, "html.parser")
    # resolve relative hrefs against the site root
    links = [urljoin(base_url, a.get("href")) for a in soup.find_all("a", href=True)]

    return links

async def main():
    seen_set = set()    # URLs already scheduled, so nothing is fetched twice
    q = queues.Queue()  # unbounded queue of URLs waiting to be fetched

    async def fetch_url(current_url):
        if current_url in seen_set:
            return

        print("获取 {}".format(current_url))

        seen_set.add(current_url)
        next_urls = await get_links(current_url)
        for new_url in next_urls:
            # follow only links that stay within the documentation site
            if new_url.startswith(base_url):
                await q.put(new_url)

    async def worker():
        async for url in q:
            # a None sentinel tells this worker to exit
            if url is None:
                return
            try:
                await fetch_url(url)
            except Exception as e:
                print("exception while fetching {}: {}".format(url, e))
            finally:
                q.task_done()

    await q.put(base_url)
    # run a fixed pool of workers; q.join() unblocks once task_done()
    # has been called for every URL that was put on the queue
    workers = gen.multi([worker() for _ in range(concurrency)])
    await q.join()
    # push one None per worker so every worker loop terminates
    for _ in range(concurrency):
        await q.put(None)
    await workers

if __name__ == '__main__':
    loop = ioloop.IOLoop.current()
    loop.run_sync(main)
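
One caveat: the script above sets no per-request timeout, so a single stalled connection can occupy a worker indefinitely. Below is a minimal, self-contained sketch of a time-boxed fetch; the fetch_with_timeout name and the 5-second/10-second limits are illustrative assumptions, not part of the original post.

from tornado import httpclient, ioloop

async def fetch_with_timeout(url):
    # connect_timeout and request_timeout are standard HTTPRequest
    # options forwarded by AsyncHTTPClient.fetch(); the 5s/10s values
    # here are illustrative assumptions
    http_client = httpclient.AsyncHTTPClient()
    try:
        response = await http_client.fetch(
            url, connect_timeout=5, request_timeout=10)
        return response.body.decode("utf8")
    except httpclient.HTTPError as e:
        # non-2xx responses and timeouts (code 599) both land here
        print("fetch failed for {}: {}".format(url, e))
        return None

if __name__ == '__main__':
    page = ioloop.IOLoop.current().run_sync(
        lambda: fetch_with_timeout("http://www.tornadoweb.org/en/stable/"))

Swapping this into get_links would keep a hung connection from stalling the crawl; similarly, constructing the queue as queues.Queue(maxsize=1000) (the bound is again an example value) would add back-pressure if links are discovered faster than the workers can drain them.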

Reposted from: https://www.cnblogs.com/PyKK2019/p/tornado-spider.html
