在middlewares.py中设置:
class ProxyMiddleware(object):
    """Scrapy downloader middleware that routes requests through a proxy
    obtained from a local proxy-pool service at http://127.0.0.1:5010/get.

    A fresh proxy is validated against http://www.baidu.com before use;
    on a download exception the request is re-scheduled with a new proxy.
    """

    logger = logging.getLogger(__name__)

    def process_request(self, request, spider):
        """Attach a validated proxy to every outgoing request.

        Returning None tells Scrapy to continue processing this request
        through the remaining middlewares.
        """
        self.logger.debug('Trying using proxy')
        request.meta['proxy'] = 'http://' + self.proxy()
        return None

    def proxy(self, max_retries=10):
        """Fetch a proxy from the local pool and verify it works.

        Retries up to ``max_retries`` times (default 10). The original
        version recursed without bound on failure — risking a
        RecursionError — and implicitly returned None when the probe
        answered with a non-200 status, which would later crash
        ``'http://' + None`` in the callers. Both paths now retry, and
        exhaustion raises instead of returning None.

        Raises:
            RuntimeError: if no working proxy is found after all retries.
        """
        for _ in range(max_retries):
            try:
                print('get proxy ...')
                proxy = requests.get("http://127.0.0.1:5010/get").text
                ip = {"http": "http://" + proxy, "https": "https://" + proxy}
                # Probe the proxy with a short timeout before handing it out.
                r = requests.get("http://www.baidu.com", proxies=ip, timeout=4)
                if r.status_code == 200:
                    return proxy
                # Non-200 probe: fall through and try another proxy.
            except requests.RequestException:
                # Only network-level failures are retried; a bare except
                # here would also hide programming errors.
                print('get proxy again ...')
        raise RuntimeError('could not obtain a working proxy '
                           'after %d attempts' % max_retries)

    def process_exception(self, request, exception, spider):
        """On a download failure, swap in a new proxy and re-schedule.

        Returning the request makes Scrapy retry it with the updated
        ``proxy`` meta key.
        """
        self.logger.debug('Get exception')
        request.meta['proxy'] = 'http://' + self.proxy()
        return request
在spider.py中设置:
def make_requests_from_url(self, url):
    """Build a Request for *url* with a 5-second download timeout.

    ``dont_filter=True`` bypasses the duplicate filter so that URLs
    re-queued after a proxy failure are not silently dropped.

    Note: the original snippet used Unicode smart quotes
    (‘download_timeout’), which is a SyntaxError in Python — replaced
    with ASCII quotes.
    """
    return scrapy.Request(url=url,
                          meta={'download_timeout': 5},
                          callback=self.parse,
                          dont_filter=True)
在settings.py中设置:
# Register the custom proxy middleware at priority 300 and disable
# Scrapy's built-in RetryMiddleware (set to None), since the proxy
# middleware re-schedules failed requests itself via process_exception.
# The original snippet used Unicode smart quotes (‘ ’) around the keys,
# which is a SyntaxError in Python — replaced with ASCII quotes.
DOWNLOADER_MIDDLEWARES = {
    'httpbintest.middlewares.ProxyMiddleware': 300,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
}