import requests
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from scrapy.utils.response import response_status_message
import base64
import logging
logger = logging.getLogger(__name__)
'''
AsyncProxyPool proxy-pool middleware.

Fetches a random proxy from the pool for each request; when a proxy
fails, the request is retried and the dead proxy is deleted from the pool.
'''
class AsyncProxyPoolRetryMiddleware(RetryMiddleware):
    """Retry middleware backed by the AsyncProxyPool proxy pool.

    On a non-200 response, or on a retryable connection exception, the
    proxy used for the request is deleted from the pool and the request
    is retried (a fresh proxy is assigned by AsyncProxyPoolMiddleware).
    """

    # Base URL of the proxy-pool web API.
    API_BASE = "http://192.168.113.128:3289"

    def delete_proxy(self, proxy):
        """Delete *proxy* from the pool; no-op when *proxy* is falsy.

        The proxy URL is urlsafe-base64 encoded so it can be embedded
        in the delete endpoint's URL path.
        """
        if not proxy:
            return
        key = base64.urlsafe_b64encode(proxy.encode()).decode()
        url = self.API_BASE + "/delete/" + key
        try:
            # timeout so a dead pool API cannot hang the crawl
            resp = requests.get(url, timeout=10).json()
            logger.debug("删除代理:%s:%s", proxy, resp)
        except requests.RequestException as exc:
            # Best effort: failing to delete a dead proxy must not
            # abort the retry of the original request.
            logger.warning("删除代理失败 %s: %s", proxy, exc)

    def process_response(self, request, response, spider):
        """Retry (and drop the proxy) for any non-200 response."""
        if request.meta.get('dont_retry', False):
            return response
        # Anything other than HTTP 200 is treated as a dead proxy.
        if response.status != 200:
            reason = response_status_message(response.status)
            logger.debug("代理失效:%s", reason)
            self.delete_proxy(request.meta.get('proxy', False))
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        """Retry (and drop the proxy) on retryable connection errors."""
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY) \
                and not request.meta.get('dont_retry', False):
            logger.debug("代理异常:%s", exception)
            self.delete_proxy(request.meta.get('proxy', False))
            return self._retry(request, exception, spider)
class AsyncProxyPoolMiddleware(object):
    """Assigns a proxy popped from the AsyncProxyPool to every request."""

    def process_request(self, request, spider):
        # Pop one proxy from the pool; the JSON body maps a key to the
        # proxy URL. Timeout so a dead pool API cannot hang the crawl.
        data = requests.get("http://192.168.113.128:3289/pop", timeout=10).json()
        if not data:
            # Empty pool: send the request without a proxy instead of
            # crashing on an empty dict.
            logger.warning("代理池为空,未设置代理")
            return None
        proxy = next(iter(data.values()))
        logger.debug("获取到的代理:%s", proxy)
        # Attach the proxy for the downloader.
        request.meta['proxy'] = proxy
# Add the following route to async_proxy_pool/webapi_flask.py:
@app.route("/delete/<string:proxy>")
def delete_proxy(proxy):
    # Remove the (base64-encoded) proxy from the pool and report how
    # many entries Redis actually deleted.
    removed = redis_conn.delete_proxies(proxy)
    return jsonify({"delete": removed})
# Add the following method to async_proxy_pool/database.py:
def delete_proxies(self, proxy):
    """Remove a urlsafe-base64-encoded proxy from the Redis sorted set.

    Re-pads the key to a multiple of 4 before decoding, in case '='
    padding was lost in URL transport.
    """
    # '-len % 4' yields 0-3 pad chars; the previous '4 - len % 4'
    # wrongly appended four '=' when the length was already a
    # multiple of 4 (which it always is for fully padded keys).
    key = proxy + '=' * (-len(proxy) % 4)
    decoded = base64.urlsafe_b64decode(key).decode()
    print(decoded)  # debug trace of the proxy being removed
    return self.redis.zrem(REDIS_KEY, decoded)