As in the previous section, we start from the commands module. The entry point of the crawl command is the run method in crawl.py; below is a brief walkthrough of run.
def run(self, args, opts):
    if len(args) < 1:
        raise UsageError()
    elif len(args) > 1:
        raise UsageError("running 'scrapy crawl' with more than one spider is no longer supported")
    spname = args[0]  # the name of the spider to run
    self.crawler_process.crawl(spname, **opts.spargs)  # calls CrawlerProcess.crawl in crawler.py
    self.crawler_process.start()  # calls CrawlerProcess.start in crawler.py
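To make this concrete, the same two calls can be issued from a standalone script. A minimal sketch, assuming a Scrapy project that defines a spider named 'quotes' (the name is hypothetical):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('quotes')  # same call the command makes with spname
process.start()          # blocks until the crawl finishes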
So the run method of crawl.py mainly relies on CrawlerProcess's crawl and start.
def crawl(self, crawler_or_spidercls, *args, **kwargs):
    crawler = self.create_crawler(crawler_or_spidercls)  # (A) build a Crawler from the spider name
    return self._crawl(crawler, *args, **kwargs)  # (B) hand it to self._crawl
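The *args and **kwargs forwarded here are the spider arguments collected from the command line with -a (exposed as opts.spargs in run above); they eventually reach the spider's __init__. A short sketch, reusing the hypothetical 'quotes' spider and process from before:

process.crawl('quotes', category='humor')  # 'category' arrives in the spider's __init__ as a keyword argument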
(A) self.create_crawler(crawler_or_spidercls) builds a Crawler from spname. A Crawler can be thought of as one concrete crawl, while CrawlerProcess's job is to run it.
def create_crawler(self, crawler_or_spidercls):
    if isinstance(crawler_or_spidercls, Crawler):  # already a Crawler: return it as-is
        return crawler_or_spidercls
    return self._create_crawler(crawler_or_spidercls)  # (A1) build and return a Crawler
def _create_crawler(self, spidercls):
    if isinstance(spidercls, six.string_types):
        spidercls = self.spider_loader.load(spidercls)  # if spidercls is a string, resolve it to a spider class via SpiderLoader.load
    return Crawler(spidercls, self.settings)  # build the Crawler
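The string branch can be exercised on its own. A minimal sketch of what self.spider_loader.load does, again assuming a project with a spider named 'quotes':

from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

loader = SpiderLoader.from_settings(get_project_settings())
spidercls = loader.load('quotes')  # raises KeyError if no spider has name == 'quotes'
print(spidercls)  # the spider class itself, not an instance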
(B) Once create_crawler has built the crawler, _crawl runs it.
def _crawl(self, crawler, *args, **kwargs):
    self.crawlers.add(crawler)  # track the crawler in self.crawlers
    d = crawler.crawl(*args, **kwargs)  # (C) run the crawler's crawl method
    self._active.add(d)

    def _done(result):
        self.crawlers.discard(crawler)
        self._active.discard(d)
        return result

    return d.addBoth(_done)  # register _done as a callback; it runs when the crawl finishes
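addBoth registers _done on both the success and the failure path, so the bookkeeping sets are cleaned up either way. The pattern is easy to reproduce with a bare Deferred; a minimal sketch, independent of Scrapy:

from twisted.internet import defer

d = defer.Deferred()

def _done(result):
    print('cleanup runs, result =', result)
    return result  # pass the result through to any later callbacks

d.addBoth(_done)        # fires on success and on failure alike
d.callback('finished')  # triggers the chain; _done prints 'finished'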
(C) Next, let's look at the crawler's crawl method. It creates the spider and the engine, then calls the engine's start methods.
@defer.inlineCallbacks
def crawl(self, *args, **kwargs):
    assert not self.crawling, "Crawling already taking place"
    self.crawling = True
    try:
        self.spider = self._create_spider(*args, **kwargs)  # (C1) create the spider
        self.engine = self._create_engine()  # (C2) create the engine
        start_requests = iter(self.spider.start_requests())  # (C3) get the initial requests from the spider
        yield self.engine.open_spider(self.spider, start_requests)  # (D) call the engine's open_spider
        yield defer.maybeDeferred(self.engine.start)  # call the engine's start, wrapped in a Deferred
    except Exception:
        ...
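The @defer.inlineCallbacks decorator turns each yield into a suspension point: the generator pauses until the yielded Deferred fires, then resumes with its result. A self-contained sketch of the same control flow (fake_crawl and its strings are made up for illustration):

from twisted.internet import defer

@defer.inlineCallbacks
def fake_crawl():
    opened = yield defer.succeed('spider opened')  # already fired, so this resumes immediately
    started = yield defer.maybeDeferred(lambda: 'engine started')  # wraps a plain value in a Deferred
    defer.returnValue((opened, started))

fake_crawl().addCallback(print)  # prints ('spider opened', 'engine started')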
(C1)(C2) The spider is created via spidercls's from_crawler method, and the engine is an ExecutionEngine instance.
def _create_spider(self, *args, **kwargs):
    return self.spidercls.from_crawler(self, *args, **kwargs)

def _create_engine(self):
    return ExecutionEngine(self, lambda _: self.stop())
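from_crawler is the hook a spider can override to get hold of the crawler (and through it, settings and signals) at construction time. A minimal sketch of a spider using it (QuotesSpider and its URL are hypothetical):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(QuotesSpider, cls).from_crawler(crawler, *args, **kwargs)
        # after the super call, crawler.settings and crawler.signals are reachable from the spider
        return spider

    def parse(self, response):
        pass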
Next, let's look at CrawlerProcess's start method. As its docstring explains, start launches the Twisted `reactor`_ and installs a DNS cache. Twisted is an event-driven networking engine framework written in Python; see http://www.aosabook.org/en/twisted.html for more background.
def start(self, stop_after_crawl=True):
    """
    This method starts a Twisted `reactor`_, adjusts its pool size to
    :setting:`REACTOR_THREADPOOL_MAXSIZE`, and installs a DNS cache based
    on :setting:`DNSCACHE_ENABLED` and :setting:`DNSCACHE_SIZE`.

    If `stop_after_crawl` is True, the reactor will be stopped after all
    crawlers have finished, using :meth:`join`.

    :param boolean stop_after_crawl: stop or not the reactor when all
        crawlers have finished
    """
    if stop_after_crawl:
        d = self.join()
        # Don't start the reactor if the deferreds are already fired
        if d.called:
            return
        d.addBoth(self._stop_reactor)
    reactor.installResolver(self._get_dns_resolver())
    tp = reactor.getThreadPool()
    tp.adjustPoolsize(maxthreads=self.settings.getint('REACTOR_THREADPOOL_MAXSIZE'))
    reactor.addSystemEventTrigger('before', 'shutdown', self.stop)
    reactor.run(installSignalHandlers=False)  # blocking call
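The reactor is a single event loop: run() blocks, dispatching timers, I/O and system events until stop() is called. A tiny standalone sketch of the pieces start uses (the one-second delay is arbitrary):

from twisted.internet import reactor

def before_shutdown():
    print('before-shutdown trigger fired')

reactor.addSystemEventTrigger('before', 'shutdown', before_shutdown)
reactor.callLater(1, reactor.stop)  # schedule a stop so run() can return
reactor.run()  # blocking call, just like in CrawlerProcess.start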
To be continued.