A Coroutine-Based Asynchronous Crawler

This post walks through an example of an asynchronous crawler built on the Tornado framework and shows how coroutines improve a crawler's efficiency and concurrency.

A small asynchronous crawler example built on the tornado framework (written for Python 2, using Tornado's generator-based @coroutine style):

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# created by fhqplzj on 2017/07/19 at 5:48 PM
import logging
import time
from datetime import timedelta
from urlparse import urljoin, urldefrag

from scrapy import Selector
from tornado.gen import coroutine, Return
from tornado.httpclient import AsyncHTTPClient
from tornado.ioloop import IOLoop
from tornado.queues import Queue

logging.basicConfig()
base_url = 'http://www.tornadoweb.org/en/stable/'
concurrency = 10


@coroutine
def get_links_from_url(url):
    try:
        response = yield AsyncHTTPClient().fetch(url)
        print 'fetched %s' % url
        html = response.body if isinstance(response.body, str) else response.body.decode()
        urls = [urljoin(url, urldefrag(new_url)[0]) for new_url in get_links(html)]
    except Exception as e:
        print 'Exception: %s %s' % (e, url)
        raise Return([])
    raise Return(urls)


def get_links(html):
    return Selector(text=html).xpath('//a/@href').extract()


@coroutine
def main():
    q = Queue()
    start = time.time()
    # fetching: URLs that have been fetched or are currently being fetched
    # fetched:  URLs that have already been fetched
    fetching, fetched = set(), set()

    @coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return
            print 'fetching %s' % current_url
            fetching.add(current_url)
            urls = yield get_links_from_url(current_url)
            fetched.add(current_url)
            for new_url in urls:
                if new_url.startswith(base_url):
                    yield q.put(new_url)
        finally:
            q.task_done()

    @coroutine
    def worker():
        while True:
            yield fetch_url()

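    # Seed the queue and launch the workers. Functions decorated with
    # @coroutine begin executing as soon as they are called, so worker()
    # needs no explicit scheduling on the IOLoop.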
    q.put(base_url)
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print 'Done in %d seconds, fetched %s URLs.' % (time.time() - start, len(fetched))


if __name__ == '__main__':
    IOLoop.current().run_sync(main)
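
For reference, below is a minimal sketch of the same crawler ported to Python 3 with native async/await coroutines, assuming Tornado 6.x and Scrapy are installed. It keeps the original structure (a shared Queue, 10 workers, a 300-second join timeout) and only swaps the Python 2 idioms:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Sketch: the same crawler on Python 3 with native coroutines (assumes Tornado 6.x).
import time
from datetime import timedelta
from urllib.parse import urljoin, urldefrag

from scrapy import Selector
from tornado.httpclient import AsyncHTTPClient
from tornado.ioloop import IOLoop
from tornado.queues import Queue

base_url = 'http://www.tornadoweb.org/en/stable/'
concurrency = 10


async def get_links_from_url(url):
    try:
        response = await AsyncHTTPClient().fetch(url)
        print('fetched %s' % url)
        html = response.body.decode(errors='ignore')
        return [urljoin(url, urldefrag(new_url)[0]) for new_url in get_links(html)]
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        return []


def get_links(html):
    return Selector(text=html).xpath('//a/@href').extract()


async def main():
    q = Queue()
    start = time.time()
    # fetching: URLs that have been fetched or are currently being fetched
    # fetched:  URLs that have already been fetched
    fetching, fetched = set(), set()

    async def fetch_url():
        current_url = await q.get()
        try:
            if current_url in fetching:
                return
            print('fetching %s' % current_url)
            fetching.add(current_url)
            urls = await get_links_from_url(current_url)
            fetched.add(current_url)
            for new_url in urls:
                if new_url.startswith(base_url):
                    await q.put(new_url)
        finally:
            q.task_done()

    async def worker():
        while True:
            await fetch_url()

    await q.put(base_url)
    # Native coroutines do not start on call; schedule each worker on the IOLoop.
    for _ in range(concurrency):
        IOLoop.current().spawn_callback(worker)
    await q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (time.time() - start, len(fetched)))


if __name__ == '__main__':
    IOLoop.current().run_sync(main)

The one behavioural difference worth noting is worker startup: a @coroutine-decorated function starts running when called, whereas a native coroutine must be scheduled explicitly, hence IOLoop.current().spawn_callback(worker).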

