Python: using Twisted to simulate Scrapy

The listing below rebuilds Scrapy's core moving parts (Request, HttpResponse, Scheduler, ExecutionEngine, Crawler, CrawlerProcess and Command) on top of Twisted's event loop.

from twisted.internet import defer
from twisted.internet import reactor
from twisted.web.client import getPage  # deprecated in modern Twisted; kept as in the original post
from queue import Queue


class Request:
    """Request class: wraps the data for a single request."""

    def __init__(self, url, callback):
        self.url = url
        self.callback = callback


class HttpResponse:
    """Response class: wraps the downloaded content."""

    def __init__(self, content, request):
        self.content = content
        self.request = request
        self.url = request.url
        self.text = str(content, encoding="utf-8")


class Scheduler(object):
    """Task scheduler."""

    def __init__(self):
        self.q = Queue()

    def open(self):
        pass

    def next_request(self):
        try:
            req = self.q.get(block=False)
        except Exception as e:
            req = None
        return req

    def enqueue_request(self, req):
        self.q.put(req)

    def size(self):
        return self.q.qsize()


class ExecutionEngine:
    """Engine: responsible for all scheduling."""

    def __init__(self):
        self._close = None
        self.scheduler = None
        self.max = 5          # maximum number of concurrent requests
        self.crawlling = []   # requests currently in flight

    def get_response_callback(self, content, request):
        self.crawlling.remove(request)
        response = HttpResponse(content, request)
        result = request.callback(response)
        import types
        if isinstance(result, types.GeneratorType):
            # the spider callback yielded new Request objects; put them back on the scheduler
            for req in result:
                self.scheduler.enqueue_request(req)

    def _next_request(self):
        if self.scheduler.size() == 0 and len(self.crawlling) == 0:
            # nothing queued and nothing in flight: the crawl is finished
            self._close.callback(None)
            return
        while len(self.crawlling) < self.max:
            req = self.scheduler.next_request()
            if not req:
                return
            self.crawlling.append(req)
            d = getPage(req.url.encode('utf-8'))
            d.addCallback(self.get_response_callback, req)
            d.addCallback(lambda _: reactor.callLater(0, self._next_request))

    @defer.inlineCallbacks
    def open_spider(self, start_requests):
        self.scheduler = Scheduler()
        # yield None only to avoid an error: defer.inlineCallbacks requires this function to be a generator
        yield self.scheduler.open()
        while True:
            try:
                req = next(start_requests)
            except StopIteration as e:
                break
            self.scheduler.enqueue_request(req)
        reactor.callLater(0, self._next_request)

    @defer.inlineCallbacks
    def start(self):
        self._close = defer.Deferred()
        yield self._close


class Crawler:
    """Wraps a scheduler and an engine for one spider."""

    def _create_engine(self):
        return ExecutionEngine()

    def _create_spider(self, spider_cls_path):
        module_path, cls_name = spider_cls_path.rsplit(".", maxsplit=1)
        import importlib
        m = importlib.import_module(module_path)
        cls = getattr(m, cls_name)
        return cls()

    @defer.inlineCallbacks
    def crawl(self, spider_cls_path):
        engine = self._create_engine()
        print(engine)  # each spider gets its own engine and scheduler to run its requests concurrently
        spider = self._create_spider(spider_cls_path)
        start_requests = iter(spider.start_requests())
        yield engine.open_spider(start_requests)
        yield engine.start()


class CrawlerProcess:
    """Starts the event loop."""

    def __init__(self):
        self._active = set()

    def crawl(self, spider_cls_path):
        crawler = Crawler()
        d = crawler.crawl(spider_cls_path)
        self._active.add(d)

    def start(self):
        dd = defer.DeferredList(self._active)
        dd.addBoth(lambda _: reactor.stop())
        reactor.run()


class Command:
    """Command entry point."""

    def run(self):
        crawl_process = CrawlerProcess()
        spider_cls_path_list = ['spider.chouti.ChoutiSpider', 'spider.baidu.BaiduSpider']
        for spider_cls_path in spider_cls_path_list:
            crawl_process.crawl(spider_cls_path)
        crawl_process.start()


if __name__ == "__main__":
    cmd = Command()
    cmd.run()
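
The spider modules referenced in spider_cls_path_list ('spider.chouti.ChoutiSpider', 'spider.baidu.BaiduSpider') are not shown in the post. Below is a minimal sketch of what one of them could look like; the package layout (a spider/ package with a chouti.py module), the import of Request from engine.py, and the URL are assumptions made for illustration, not part of the original code.

# spider/chouti.py -- hypothetical example, assuming the listing above is saved as engine.py
from engine import Request


class ChoutiSpider:
    name = 'chouti'

    def start_requests(self):
        # seed requests; ExecutionEngine.open_spider iterates this generator
        yield Request(url='http://dig.chouti.com/', callback=self.parse)

    def parse(self, response):
        # response is an HttpResponse built by the engine
        print(response.url, len(response.text))
        # yielding further Request objects here would re-enter the scheduler:
        # yield Request(url='http://dig.chouti.com/all', callback=self.parse)

With a spider/ package like this on the import path, running the script starts one engine per spider on a single reactor, and the reactor stops once every engine's _close deferred has fired.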
