from queue import Queue, Empty

from twisted.internet import defer
from twisted.internet import reactor
from twisted.web.client import getPage
# Request class: wraps the data for one crawl request.
class Request:
    """A single crawl request: the target URL plus the callback that will
    be invoked with the resulting HttpResponse."""

    def __init__(self, url, callback):
        self.url = url            # URL to download
        self.callback = callback  # invoked as callback(HttpResponse)
# Response class: wraps the downloaded data for the spider callback.
class HttpResponse:
    """Wraps the raw downloaded bytes together with the Request that
    produced them."""

    def __init__(self, content, request):
        self.content = content      # raw bytes returned by getPage
        self.request = request      # the originating Request
        self.url = request.url
        # NOTE(review): assumes the page body is UTF-8 — confirm for
        # sites that serve other encodings.
        self.text = str(content, encoding="utf-8")


class Scheduler(object):
    """Task scheduler: a FIFO queue of pending Requests."""

    def __init__(self):
        self.q = Queue()

    def open(self):
        # Hook for future setup work; nothing to initialize yet.
        pass

    def next_request(self):
        """Return the next pending Request, or None when the queue is empty."""
        try:
            req = self.q.get(block=False)
        except Empty:  # narrowed from a blanket `except Exception`
            req = None
        return req

    def enqueue_request(self, req):
        self.q.put(req)

    def size(self):
        return self.q.qsize()


class ExecutionEngine:
    """Engine: owns all scheduling, keeping at most `self.max` downloads
    in flight and firing `self._close` when every request is done."""

    def __init__(self):
        self._close = None      # Deferred fired when the crawl completes
        self.scheduler = None   # set by open_spider()
        self.max = 5            # concurrency cap on simultaneous downloads
        self.crawlling = []     # requests currently being downloaded

    def get_response_callback(self, content, request):
        """Handle a finished download: build the HttpResponse, run the
        spider callback, and enqueue any new Requests it yields."""
        self.crawlling.remove(request)
        response = HttpResponse(content, request)
        result = request.callback(response)
        import types
        if isinstance(result, types.GeneratorType):
            for req in result:
                self.scheduler.enqueue_request(req)

    def _next_request(self):
        """Refill the download slots; close the engine when both the queue
        and the in-flight list are empty."""
        if self.scheduler.size() == 0 and len(self.crawlling) == 0:
            self._close.callback(None)
            return
        # Original source lost the loop bound when newlines were stripped;
        # `< self.max` restores the intended concurrency cap — TODO confirm.
        while len(self.crawlling) < self.max:
            req = self.scheduler.next_request()
            if not req:
                return
            self.crawlling.append(req)
            d = getPage(req.url.encode('utf-8'))
            d.addCallback(self.get_response_callback, req)
            d.addCallback(lambda _: reactor.callLater(0, self._next_request))

    @defer.inlineCallbacks
    def open_spider(self, start_requests):
        """Seed the scheduler with the spider's start requests and kick off
        the crawl loop.

        The `yield` keeps this function a generator, which
        defer.inlineCallbacks requires."""
        self.scheduler = Scheduler()
        yield self.scheduler.open()
        for req in start_requests:
            self.scheduler.enqueue_request(req)
        reactor.callLater(0, self._next_request)

    @defer.inlineCallbacks
    def start(self):
        """Return a Deferred that fires only when the whole crawl is done."""
        self._close = defer.Deferred()
        yield self._close


class Crawler:
    """Ties together one scheduler and one engine for a single spider."""

    def _create_engine(self):
        return ExecutionEngine()

    def _create_spider(self, spider_cls_path):
        """Import a dotted path like `pkg.module.ClassName` and return an
        instance of that class."""
        module_path, cls_name = spider_cls_path.rsplit(".", maxsplit=1)
        import importlib
        m = importlib.import_module(module_path)
        cls = getattr(m, cls_name)
        return cls()

    @defer.inlineCallbacks
    def crawl(self, spider_cls_path):
        engine = self._create_engine()
        print(engine)  # one engine + scheduler per spider, crawling its requests concurrently
        spider = self._create_spider(spider_cls_path)
        start_requests = iter(spider.start_requests())
        yield engine.open_spider(start_requests)
        yield engine.start()


class CrawlerProcess:
    """Owns the reactor event loop and the set of active crawls."""

    def __init__(self):
        self._active = set()

    def crawl(self, spider_cls_path):
        crawler = Crawler()
        d = crawler.crawl(spider_cls_path)
        self._active.add(d)

    def start(self):
        dd = defer.DeferredList(self._active)
        dd.addBoth(lambda _: reactor.stop())  # stop the loop once every crawl finishes
        reactor.run()


class Command:
    """Command-line entry point."""

    def run(self):
        crawl_process = CrawlerProcess()
        spider_cls_path_list = ['spider.chouti.ChoutiSpider', 'spider.baidu.BaiduSpider']
        for spider_cls_path in spider_cls_path_list:
            crawl_process.crawl(spider_cls_path)
        crawl_process.start()


if __name__ == "__main__":
    cmd = Command()
    cmd.run()