简单粗暴些,利用 scrapy 框架原理自定义 middleware 处理状态码异常、IP 超时的异常,重新发送请求,
这里需要重写scrapy 内置的中间件 RetryMiddleware,
middlewares.py
class Process_Proxies(RetryMiddleware):
    """Retry middleware that also discards the failing proxy.

    Extends Scrapy's built-in RetryMiddleware: on a non-200 response or a
    retryable download exception it removes the proxy that was used from the
    proxy pool, waits a few seconds, and re-schedules the request via the
    inherited ``_retry`` helper.
    """

    logger = logging.getLogger(__name__)

    def dele_proxy(self, proxy, res=None):
        """Remove *proxy* from the pool; no-op when *proxy* is falsy.

        ``res`` is unused and kept only for backward compatibility with
        existing callers.
        """
        print('删除代理')
        if proxy:
            gp = GetProxy()
            gp.removeproxy(proxy)

    def process_response(self, request, response, spider):
        """Retry any non-200 response after dropping the proxy that produced it."""
        # Honor the dont_retry flag, matching both process_exception below
        # and the contract of the stock RetryMiddleware.
        if request.meta.get('dont_retry', False):
            return response
        if response.status != 200:
            print('状态码异常')
            reason = response_status_message(response.status)
            # FIX: use .get() — requests sent without a proxy have no
            # 'proxy' key and the original ['proxy'] access raised KeyError.
            self.dele_proxy(request.meta.get('proxy'), False)
            # NOTE(review): time.sleep blocks the whole reactor thread;
            # acceptable for a small crawl, but a DOWNLOAD_DELAY / deferred
            # backoff would be preferable at scale.
            time.sleep(random.randint(3, 5))
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        """On retryable download errors, drop the proxy and retry the request."""
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            self.dele_proxy(request.meta.get('proxy', False))
            time.sleep(random.randint(3, 5))
            self.logger.warning('连接异常,进行重试......')
            return self._retry(request, exception, spider)
settings.py
# Swap the stock UA / redirect middlewares for custom handling and register
# the proxy-aware retry middleware.
DOWNLOADER_MIDDLEWARES = {
    'BaiduSpider.middlewares.UserAgentMiddleware': 100,
    # Disable the built-in UA middleware; the custom one above replaces it.
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # FIX: the 'scrapy.contrib.*' paths were removed in Scrapy 1.x; the
    # modern module path must be used or the setting silently targets a
    # nonexistent class.
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,
    'BaiduSpider.middlewares.Process_Proxies': 120,
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 500,
}

RETRY_TIMES = 10
# FIX: the original line was "RETRY_ENABLED: True" — a bare variable
# annotation, which never assigns a value, so the setting was never set.
RETRY_ENABLED = True