import logging
from threading import Lock

import requests
from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.downloadermiddlewares.retry import RetryMiddleware, response_status_message
from twisted.internet import defer
from twisted.internet.error import (TimeoutError, DNSLookupError,
                                    ConnectionRefusedError, ConnectionDone,
                                    ConnectError, ConnectionLost,
                                    TCPTimedOutError)
from twisted.web.client import ResponseFailed
from urllib3.exceptions import ProtocolError, ProxyError, ProxySchemeUnknown

from versace import settings
class MyRetryMiddleware(RetryMiddleware):
    """Retry middleware with proxy rotation for JD spiders.

    Keeps a class-level pool of ``https://ip:port`` proxy URLs fetched from
    ``settings.PROXY_IP_API``.  For spiders whose name contains ``'jd'`` it
    attaches a proxy and a Referer header to each outgoing request, and swaps
    in a fresh proxy whenever a response comes back empty, carries a retryable
    HTTP status, or a connection-level exception is raised.

    Note: since the stock RetryMiddleware is disabled in the project settings,
    this class is the only retry path for every spider, jd or not.
    """

    logger = logging.getLogger(__name__)

    # Network-level failures that justify swapping the proxy and retrying.
    EXCEPTIONS_TO_RETRY = (defer.TimeoutError, TimeoutError, DNSLookupError,
                           ConnectionRefusedError, ConnectionDone, ConnectError,
                           ConnectionLost, TCPTimedOutError, ResponseFailed,
                           IOError, TunnelError, ProtocolError, ProxyError,
                           ProxySchemeUnknown)

    # Shared across all instances; every read-modify-write must hold ``lock``.
    proxy_list = []
    lock = Lock()

    def get_proxy_ip(self, num=10, time=1):
        """Pop a proxy from the pool, refilling it from the API when empty.

        The chosen proxy is rotated to the tail of the pool so successive
        calls cycle evenly through all known proxies.

        :param num: how many proxies to request when refilling the pool.
        :param time: validity period forwarded to the proxy API.
        :returns: a ``'https://ip:port'`` URL, or ``''`` if the API returned
            no usable proxies.
        """
        # ``with`` guarantees the lock is released even if the HTTP fetch
        # raises (the original acquire()/release() pair leaked the lock on
        # error), and the rotation append now happens inside the critical
        # section instead of racing with other threads after release.
        with self.lock:
            if not self.proxy_list:
                print('ip池为空,重新获取。。。。')
                res_json = requests.get(url=settings.PROXY_IP_API.format(num, time)).json()
                # Tolerate a missing/None 'data' key instead of raising.
                data_list = res_json.get('data') or []
                self.proxy_list = ['https://' + i.get('ip') + ':' + str(i.get('port'))
                                   for i in data_list]
                print(self.proxy_list)
            proxy_ip = self.proxy_list.pop(0) if self.proxy_list else ''
            if proxy_ip:
                # Rotate: put the proxy back at the tail so it is reused last.
                self.proxy_list.append(proxy_ip)
        return proxy_ip

    def delete_proxy(self, proxy):
        """Remove a dead proxy from the shared pool; no-op if absent."""
        with self.lock:
            if proxy in self.proxy_list:
                self.proxy_list.remove(proxy)

    def _remember_proxy(self, proxy_ip):
        """Re-add a proxy to the pool (thread-safe, skips '' and duplicates)."""
        if proxy_ip:
            with self.lock:
                if proxy_ip not in self.proxy_list:
                    self.proxy_list.append(proxy_ip)

    def process_request(self, request, spider):
        """Attach a rotating proxy and a JD Referer to outgoing JD requests."""
        if 'jd' in spider.name:
            proxy_ip = request.meta.get('proxy')
            if not proxy_ip:
                proxy_ip = self.get_proxy_ip(num=settings.CONCURRENT_REQUESTS * 2, time=2)
                request.meta['proxy'] = proxy_ip
                request.headers['Referer'] = 'https://www.jd.com'
            # Original appended unconditionally, so an empty string could end
            # up in the pool; _remember_proxy guards against that and locks.
            self._remember_proxy(proxy_ip)

    def process_response(self, request, response, spider):
        """Retry with a fresh proxy on empty bodies or retryable statuses.

        :returns: the ``response`` when no retry is needed, a replacement
            ``request`` (empty-body case), or the result of ``_retry``.
        """
        # Honour dont_retry for every spider — the original skipped this check
        # both in the non-jd branch and in the empty-body branch, even though
        # the stock RetryMiddleware is disabled and cannot enforce it.
        if request.meta.get('dont_retry', False):
            return response
        if 'jd' in spider.name:
            proxy_ip = request.meta.get('proxy')
            if not response.body:
                # An empty body is treated as the proxy being blocked.
                print(response.body)
                self.logger.info('ip被屏蔽,更换代理IP...')
                print(proxy_ip)
                self.delete_proxy(proxy_ip)
                proxy_ip = self.get_proxy_ip(num=settings.CONCURRENT_REQUESTS * 2, time=2)
                request.meta['proxy'] = proxy_ip
                request.headers['Referer'] = 'https://www.jd.com'
                self._remember_proxy(proxy_ip)
                return request
            if response.status in self.retry_http_codes:
                reason = response_status_message(response.status)
                self.delete_proxy(proxy_ip)
                self.logger.info('返回值异常, 更换代理IP进行重试...')
                return self._retry(request, reason, spider) or response
        elif response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        """On retryable connection errors, drop the dead proxy and retry."""
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            if 'jd' in spider.name:
                # Discard the proxy that just failed and pick a fresh one.
                self.delete_proxy(request.meta.get('proxy'))
                request.meta['proxy'] = self.get_proxy_ip(
                    num=settings.CONCURRENT_REQUESTS * 2, time=2)
                self.logger.info('连接异常, 更换代理IP进行重试...')
            return self._retry(request, exception, spider)
# NOTE(review): DOWNLOADER_MIDDLEWARES is a Scrapy *settings* value; it looks
# like it belongs in settings.py rather than in this middleware module —
# confirm Scrapy actually reads it from here. It disables the stock
# RetryMiddleware (None) and registers MyRetryMiddleware at order 544, right
# after VersaceDownloaderMiddleware (543).
DOWNLOADER_MIDDLEWARES = {
    'versace.middlewares.VersaceDownloaderMiddleware': 543,
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
    "versace.middlewares.MyRetryMiddleware": 544
}