1. Twisted's asynchrony is built on the Deferred object
When writing asynchronous code with Twisted, first import defer and reactor (the example below also uses getPage to issue HTTP requests):
from twisted.internet import reactor, defer
from twisted.web.client import getPage
def print_(n):
    """
    Decode and print the returned page body.
    """
    print(n.decode('UTF-8'))
@defer.inlineCallbacks  # this decorator lets us yield multiple Deferred objects conveniently
def func(n):
    print('%s start' % n)
    # use Twisted's getPage to send the network request; it returns a Deferred
    s = getPage('http://www.baidu.com'.encode('UTF-8'))
    # add a callback for when the response arrives
    s.addCallback(print_)
    print('sleeping for %s seconds' % n)
    print('%s end' % n)
    print('**********************\n')
    yield s
def down(arg):
    """
    Stop the reactor event loop.
    """
    reactor.stop()
if __name__ == '__main__':
    # call func and collect the Deferred objects it returns
    lis = [func(i) for i in range(3)]
    # gather the Deferred objects into a DeferredList
    d = defer.DeferredList(lis)
    # once every Deferred in the DeferredList has fired, run down()
    d.addBoth(down)
    # start the reactor
    reactor.run()
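A caveat on getPage: it has been deprecated in newer Twisted releases (and removed in recent ones) in favour of Agent. As a hedged sketch, not part of the original example (the fetch function name is my own), the same request could be made with the supported Agent/readBody API roughly like this:

from twisted.internet import reactor, defer
from twisted.web.client import Agent, readBody

@defer.inlineCallbacks
def fetch(url):
    # Agent.request takes the method and URL as bytes and returns a Deferred
    agent = Agent(reactor)
    response = yield agent.request(b'GET', url.encode('UTF-8'))
    # readBody returns a Deferred that fires with the response body as bytes
    body = yield readBody(response)
    print(body.decode('UTF-8')[:100])

if __name__ == '__main__':
    d = fetch('http://www.baidu.com')
    d.addBoth(lambda _: reactor.stop())
    reactor.run()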
2. A very simple asynchronous crawler written with Twisted
import queue
import types
from twisted.internet import defer      # defer.Deferred is a placeholder for a future result; it does not send requests itself
from twisted.internet import reactor    # the event loop
from twisted.web.client import getPage  # sends the actual HTTP request
class Request:
    """A URL to download plus the callback that will parse it."""
    def __init__(self, url, callback):
        self.url = url
        self.callback = callback

class HttpResponse:
    """Wraps the downloaded bytes together with the originating request."""
    def __init__(self, content, request):
        self.content = content
        self.request = request
        self.url = request.url
        self.text = content.decode('UTF-8')
class XueXiSpider:
    name = 'xue_xi'

    def start_request(self):
        start_url = ['http://www.baidu.com', 'http://www.bing.com']
        for url in start_url:
            yield Request(url, self.parse)

    def parse(self, response):
        print(response.url)
        yield Request('http://www.cnblogs.com', callback=self.parse)
# the scheduler: a plain FIFO queue of pending Request objects
schedule = queue.Queue()
class Engine:
    def __init__(self):
        # Deferred used to shut the engine down
        self._close = None
        # maximum number of concurrent downloads
        self.max = 5
        # requests currently being downloaded; used to cap concurrency
        self.crawling = []
    def get_response_callback(self, content, request):
        """
        Call the request's callback to parse the downloaded content and
        push any Request objects it yields back into the scheduler.
        :param content: the downloaded body
        :param request: the Request object that produced it
        :return:
        """
        # remove the finished request from the download list
        self.crawling.remove(request)
        # build a response object
        response = HttpResponse(content, request)
        # call parse (or whatever callback was attached) on the response
        reqs = request.callback(response)
        # if the callback returned a generator, iterate it
        if isinstance(reqs, types.GeneratorType):
            for req in reqs:
                # push each new request into the scheduler
                schedule.put(req)
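One detail worth making explicit: spider callbacks such as parse are generator functions, so calling one only creates a generator object without running any of its body; that is exactly what the types.GeneratorType check above detects. A quick standalone illustration:

import types

def parse(response):
    print('parsing', response)
    yield 'a new Request would go here'

g = parse('fake response')                 # nothing is printed yet
print(isinstance(g, types.GeneratorType))  # True
for item in g:                             # iterating actually runs the body
    print(item)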
    def _next_request(self):
        """
        Keep pulling requests from the queue; fire the closing Deferred
        once there is nothing left to download.
        :return:
        """
        # a bare defer.Deferred() never fires on its own, so fire it manually
        # once both the download list and the queue are empty
        if len(self.crawling) == 0 and schedule.qsize() == 0:
            self._close.callback(None)
            return
        # concurrency cap: if the download list already holds more than
        # self.max requests, do not start any new downloads
        if len(self.crawling) > self.max:
            return
        while len(self.crawling) < self.max:
            try:
                # take a request off the scheduler; block=False means do not wait
                req = schedule.get(block=False)
                # record it in the download list
                self.crawling.append(req)
                # have Twisted's getPage issue the request; the Deferred it returns
                # is cleaned up automatically after its callbacks have run
                d = getPage(req.url.encode('UTF-8'))
                # parse the response when it arrives
                d.addCallback(self.get_response_callback, req)
                # then schedule _next_request again to keep the pipeline full
                d.addCallback(lambda _: reactor.callLater(0, self._next_request))
            except Exception as e:
                print(e)
                return
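The broad except above is really catching queue.Empty: with block=False, queue.Queue.get raises queue.Empty immediately instead of waiting, which is what breaks out of the while loop once the scheduler runs dry. A standalone demonstration:

import queue

q = queue.Queue()
q.put('req')
print(q.get(block=False))  # 'req'
try:
    q.get(block=False)     # the queue is now empty, so this raises at once
except queue.Empty:
    print('scheduler drained')

Catching queue.Empty explicitly rather than a bare Exception would make that intent clearer in the engine code.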
    @defer.inlineCallbacks
    def crawl(self, spider_):
        """
        Push the spider's initial requests into the queue and schedule
        self._next_request.
        :param spider_: the spider object
        :return: a Deferred
        """
        # collect the initial requests
        start_requests = iter(spider_.start_request())
        while True:
            try:
                # push each one into the queue
                schedule.put(next(start_requests))
            except StopIteration:
                break
        # calling crawl() does not run self._next_request right away; nothing
        # happens until reactor.run() starts the event loop
        # schedule self._next_request to run 0 seconds from now
        reactor.callLater(0, self._next_request)
        # create a Deferred that will not fire by itself and keep it in
        # self._close; it is fired once the queue and download list are empty
        self._close = defer.Deferred()
        yield self._close
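To make the self._close trick concrete: a bare defer.Deferred() never fires by itself; its callbacks only run once someone calls .callback() on it, which is what _next_request does when both the queue and the download list are empty. A minimal standalone sketch:

from twisted.internet import defer

d = defer.Deferred()
d.addCallback(lambda result: print('fired with', result))
# nothing has happened yet; the Deferred is still waiting
d.callback(None)  # now the callback runs and prints 'fired with None'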
if __name__ == '__main__':
    # a set to hold the crawl Deferreds
    _active = set()
    # create the spider
    spider = XueXiSpider()
    # create the engine
    engine = Engine()
    # crawl() returns a Deferred
    d = engine.crawl(spider)
    _active.add(d)
    dd = defer.DeferredList(_active)
    # stop the reactor once self._close.callback(None) has fired
    dd.addBoth(lambda a: reactor.stop())
    # start the main event loop
    reactor.run()
# Note: none of the Deferreds above have an addErrback() error handler attached. If a callback raises, the program will not report the error; it simply hangs.
# Also note that the lambda passed to addBoth must take one parameter, because addBoth always passes the Deferred's result (or Failure) into it.
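A minimal sketch of the errback mechanism described above: an exception raised inside a callback is wrapped in a Failure and travels down the chain; if no errback ever handles it, the error can go unreported and the crawler simply hangs, as noted.

from twisted.internet import defer

def on_error(failure):
    print('caught:', failure.getErrorMessage())

d = defer.Deferred()
d.addCallback(lambda _: 1 / 0)  # this callback raises ZeroDivisionError
d.addErrback(on_error)          # without this line the error would go unhandled
d.callback(None)                # prints: caught: division by zero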