可以在下载中间件（Downloader Middleware）中直接处理 404、503 等异常响应。
settings.py
# Register the custom downloader middleware.
# The order value is set to 200 so that the other default middlewares get to
# process the response first (Scrapy calls process_response in decreasing
# order of this number).
DOWNLOADER_MIDDLEWARES = {
    'test_scrapy.middlewares.ProcessAllException': 200,
}
from scrapy.http import HtmlResponse
from scrapy.exceptions import IgnoreRequest
class ProcessAllException(object):
def process_response(self, request, response, spider):
    """Discard any response that represents a failed download.

    A response is treated as failed when its HTTP status is 400 or above,
    or when it carries an ``exception`` attribute (presumably attached by
    this middleware's ``process_exception`` -- verify against that method).
    Failed responses are reported on stdout and then dropped.

    Args:
        request: The request that produced ``response`` (not used here).
        response: The response to inspect; must expose ``status``.
        spider: The spider the response is intended for (not used here).

    Returns:
        ``response`` unchanged when it is not a failure.

    Raises:
        IgnoreRequest: If the response is a failure. The response object
            itself is passed as the exception argument.
    """
    has_exception = hasattr(response, 'exception')

    # Happy path first: anything below 400 without an attached exception
    # is handed on untouched.
    if response.status < 400 and not has_exception:
        return response

    print('process_response :', response.status, response)
    if has_exception:
        print('exception:', response.exception)

    # NOTE(review): the original paste had lost its indentation; the raise
    # is assumed to apply to every failed response (HTTP errors as well as
    # download exceptions), not only to those carrying an exception.
    raise IgnoreRequest(response)
def process_exception(self, request, exception, spider):
status_code = 400
#构造一个假的response , 传递给process_response
obj = HtmlResponse(url&