pyspider's asynchronous mechanism

How does pyspider implement asynchronous fetching?

First, look at the call flow:

 # pyspider\run.py
 # line: 229
def fetcher(ctx, xmlrpc, xmlrpc_host, xmlrpc_port, poolsize, proxy, user_agent,
            timeout, phantomjs_endpoint, splash_endpoint, fetcher_cls,
            async=True, get_object=False, no_input=False):
    """
    Run Fetcher.
    """
    g = ctx.obj
    Fetcher = load_cls(None, None, fetcher_cls)

    if no_input:
        inqueue = None
        outqueue = None
    else:
        inqueue = g.scheduler2fetcher   # queue the scheduler puts tasks into
        outqueue = g.fetcher2processor  # queue for fetch results
    fetcher = Fetcher(inqueue=inqueue, outqueue=outqueue,
                      poolsize=poolsize, proxy=proxy, async=async)
    fetcher.phantomjs_proxy = phantomjs_endpoint or g.phantomjs_proxy
    fetcher.splash_endpoint = splash_endpoint
    if user_agent:
        fetcher.user_agent = user_agent
    if timeout:
        fetcher.default_options = copy.deepcopy(fetcher.default_options)
        fetcher.default_options['timeout'] = timeout

    g.instances.append(fetcher)
    if g.get('testing_mode') or get_object:
        return fetcher

    if xmlrpc:
        utils.run_in_thread(fetcher.xmlrpc_run, port=xmlrpc_port, bind=xmlrpc_host)
    fetcher.run()   # start the fetcher

The part responsible for the actual fetching lives in

pyspider\fetcher\tornado_fetcher.py

# line 636: the Fetcher's run method
    def run(self):
        '''Run loop'''
        logger.info("fetcher starting...")

        def queue_loop():
            if not self.outqueue or not self.inqueue:
                return
            while not self._quit:
                try:
                    if self.outqueue.full():
                        break
                    if self.http_client.free_size() <= 0:
                        break
                    task = self.inqueue.get_nowait()  # pull a task
                    # FIXME: decode unicode_obj should used after data selete from
                    # database, it's used here for performance
                    task = utils.decode_unicode_obj(task)
                    self.fetch(task)
                except queue.Empty:
                    break
                except KeyboardInterrupt:
                    break
                except Exception as e:
                    logger.exception(e)
                    break

        tornado.ioloop.PeriodicCallback(queue_loop, 100, io_loop=self.ioloop).start()  # poll for tasks in a loop
        tornado.ioloop.PeriodicCallback(self.clear_robot_txt_cache, 10000, io_loop=self.ioloop).start()
        self._running = True

        try:
            self.ioloop.start()
        except KeyboardInterrupt:
            pass

        logger.info("fetcher exiting...")

The rough flow of the whole run method:

  1. Define queue_loop, which pulls tasks from the task queue and checks the boundary conditions (output queue full, no free slots in the HTTP client); the concrete fetching and result handling happen inside fetch.
  2. Drive queue_loop with tornado.ioloop.PeriodicCallback so tasks are pulled in a loop (see the sketch below).
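
To make that polling pattern concrete, here is a simplified standalone sketch (not pyspider's actual code) of driving a non-blocking queue_loop with PeriodicCallback; it assumes a recent tornado where the io_loop argument has been removed:

import queue

from tornado.ioloop import IOLoop, PeriodicCallback

inqueue = queue.Queue()

def queue_loop():
    # Drain whatever is currently queued without blocking the IOLoop;
    # get_nowait() raises queue.Empty as soon as the queue is empty.
    while True:
        try:
            task = inqueue.get_nowait()
        except queue.Empty:
            break
        print('got task:', task)

if __name__ == '__main__':
    for i in range(3):
        inqueue.put({'taskid': i, 'url': 'http://example.com/%d' % i})
    PeriodicCallback(queue_loop, 100).start()   # run queue_loop every 100 ms
    IOLoop.current().call_later(1, IOLoop.current().stop)
    IOLoop.current().start()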

The fetch method calls async_fetch:

    def fetch(self, task, callback=None):
        if self.async:
            return self.async_fetch(task, callback)
        else:
            return self.async_fetch(task, callback).result()
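
The non-async branch can call .result() directly because in that mode the whole fetch chain completes synchronously before async_fetch hands back its Future; nothing is left pending. A minimal sketch of that property (the immediate coroutine below is made up for illustration):

from tornado import gen

@gen.coroutine
def immediate():
    # Nothing pending is yielded, so the coroutine finishes before its
    # Future is handed back to the caller.
    raise gen.Return(42)

fut = immediate()
print(fut.done())     # True -- already resolved
print(fut.result())   # 42   -- safe to read without waiting
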

async_fetch

    @gen.coroutine
    def async_fetch(self, task, callback=None):
        '''Do one fetch'''
        url = task.get('url', 'data:,')
        if callback is None:
            callback = self.send_result

        type = 'None'
        start_time = time.time()
        try:
            if url.startswith('data:'):
                type = 'data'
                result = yield gen.maybe_future(self.data_fetch(url, task))
            elif task.get('fetch', {}).get('fetch_type') in ('js', 'phantomjs'):
                type = 'phantomjs'
                result = yield self.phantomjs_fetch(url, task)
            elif task.get('fetch', {}).get('fetch_type') in ('splash', ):
                type = 'splash'
                result = yield self.splash_fetch(url, task)
            else:
                type = 'http'
                result = yield self.http_fetch(url, task)
        except Exception as e:
            logger.exception(e)
            result = self.handle_error(type, url, task, start_time, e)

        callback(type, task, result)
        self.on_result(type, task, result)
        raise gen.Return(result)

1) Depending on the task's fetch type, dispatch to the corresponding concrete fetch function; here we look at the http_fetch case.
2) Parse and handle the result.
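
Note the gen.maybe_future wrapper around data_fetch: it lets the coroutine yield the return value of a plain synchronous helper exactly as if it were a Future. A small sketch of that pattern, assuming the tornado 4.x-era API pyspider targets (gen.maybe_future was deprecated in 4.3); the sync_fetch helper is made up for illustration:

from tornado import gen, ioloop

def sync_fetch(url):
    # A plain synchronous helper -- it returns a value, not a Future.
    return {'url': url, 'status_code': 200}

@gen.coroutine
def demo_fetch(url):
    # gen.maybe_future wraps a bare value into an already-resolved Future,
    # so sync and async helpers can be yielded uniformly.
    result = yield gen.maybe_future(sync_fetch(url))
    raise gen.Return(result)

print(ioloop.IOLoop.current().run_sync(lambda: demo_fetch('data:,')))
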

The asynchronous mechanism here is a bit different from the usual tornado asynchronous crawler. Typical code looks like this:

# coding=utf-8
"""
tornado asynchronous crawler example

"""
import time
from datetime import timedelta

try:
    from HTMLParser import HTMLParser
    from urlparse import urljoin, urldefrag
except ImportError:
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urldefrag

from tornado import httpclient, gen, ioloop, queues

base_url = 'http://www.tornadoweb.org/en/stable/'
concurrency = 10


@gen.coroutine
def get_links_from_url(url):
    """Download the page at `url` and parse it for links.

    Returned links have had the fragment after `#` removed, and have been made
    absolute so, e.g. the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        print('fetched %s' % url)

        html = response.body if isinstance(response.body, str) \
            else response.body.decode()
        urls = [urljoin(url, remove_fragment(new_url))
                for new_url in get_links(html)]
    except Exception as e:
        print('Exception: %s %s' % (e, url))
        raise gen.Return([])

    raise gen.Return(urls)


def remove_fragment(url):
    pure_url, frag = urldefrag(url)
    return pure_url


def get_links(html):
    class URLSeeker(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    url_seeker.feed(html)
    return url_seeker.urls


@gen.coroutine
def main():
    q = queues.Queue()
    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return

            print('fetching %s' % current_url)
            fetching.add(current_url)
            urls = yield get_links_from_url(current_url)
            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)

        finally:
            q.task_done()

    @gen.coroutine
    def worker():
        while True:
            yield fetch_url()

    q.put(base_url)

    # Start workers, then wait for the work queue to be empty.
    for _ in range(concurrency):
        worker()
    yield q.join(timeout=timedelta(seconds=300))
    assert fetching == fetched
    print('Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched)))


if __name__ == '__main__':
    import logging
    logging.basicConfig()
    io_loop = ioloop.IOLoop.current()
    io_loop.run_sync(main)

Every layer above the function that does the actual fetching must be decorated with @gen.coroutine, and the crawl is finally started with io_loop.run_sync(main). In pyspider, however, the loop is driven by tornado.ioloop.PeriodicCallback instead.

According to the official documentation, the next invocation will not start if the current one has not finished:

If the callback runs for longer than callback_time milliseconds, subsequent invocations will be skipped to get back on schedule.

In practice, though, if the callback itself makes asynchronous calls inside the PeriodicCallback:

Basically, if a callback takes more than callback_time to execute, subsequent invocations are only skipped if the callback is synchronous. If the callback calls another asynchronous routine, PeriodicCallback has no way of knowing that, and will start another invocation of the callback.
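
A minimal sketch of that overlap, assuming an older tornado (4.x/5.x, the versions pyspider targets) where PeriodicCallback does not wait for the Future returned by a coroutine callback:

from tornado import gen
from tornado.ioloop import IOLoop, PeriodicCallback

@gen.coroutine
def slow_callback():
    # Returns (a Future) right away; the 500 ms "work" happens asynchronously,
    # so PeriodicCallback schedules the next run only 100 ms later and the
    # invocations overlap instead of being skipped.
    print('callback started')
    yield gen.sleep(0.5)
    print('callback finished')

if __name__ == '__main__':
    PeriodicCallback(slow_callback, 100).start()
    IOLoop.current().call_later(1.2, IOLoop.current().stop)
    IOLoop.current().start()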

