1. Why build a proxy pool?
- Many websites deploy dedicated anti-crawling measures, so a crawler may run into IP bans.
- A large number of free proxies are published openly on the internet and can be put to use.
- With periodic checking and maintenance, these free proxies can still yield reliable ones.
2. Requirements for the proxy pool
- Crawl from multiple sites, check proxies asynchronously
- Filter on a schedule, keep the pool continuously updated
- Provide an interface so proxies are easy to fetch
3. Architecture of the proxy pool
Getter module
Crawls the proxy sites on a schedule, normalizes the results into a fixed format, and then saves the usable proxies to the database.
Storage module
Uses a Redis sorted set. Proxies must not be duplicated, each proxy's availability must be tracked, and every proxy must be handled dynamically in real time. This is also the central module.
In a Redis sorted set, each proxy is a member of the set and carries a score field; the set is ordered by score, with low scores on the left and high scores on the right.
Scoring rules:
1) A score of 100 means usable. When a check succeeds the score is set to 100; when it fails the score is decreased by 1, and once it reaches 0 the proxy is removed.
2) A newly crawled proxy starts with a score of 10 and is set to 100 once it passes a test.
A sketch of these scoring rules in code is shown below; note that the full implementation in Section 4 uses a simpler list-based queue instead.
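As a rough illustration only, the scoring rules map directly onto Redis sorted-set commands. The sketch below uses redis-py; the class name ScoredRedisClient and the key name proxies:zset are made up for this example and are not part of the code in Section 4.

```python
# Illustrative sketch of the score-based design on a Redis sorted set.
# ScoredRedisClient and the key 'proxies:zset' are assumed names, not
# part of the implementation shown in Section 4.
import redis

MAX_SCORE, INITIAL_SCORE, MIN_SCORE = 100, 10, 0

class ScoredRedisClient(object):
    def __init__(self, host='localhost', port=6379):
        self._db = redis.Redis(host=host, port=port)

    def add(self, proxy):
        # Rule 2: a newly crawled proxy starts at 10
        if self._db.zscore('proxies:zset', proxy) is None:
            self._db.zadd('proxies:zset', {proxy: INITIAL_SCORE})

    def max(self, proxy):
        # Rules 1 and 2: a successful check promotes the proxy to 100
        self._db.zadd('proxies:zset', {proxy: MAX_SCORE})

    def decrease(self, proxy):
        # Rule 1: a failed check costs one point; at 0 the proxy is removed
        score = self._db.zincrby('proxies:zset', -1, proxy)
        if score <= MIN_SCORE:
            self._db.zrem('proxies:zset', proxy)
```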
Tester module
Periodically checks the proxies in the database, marks each proxy's status through its score, and keeps good proxies available for the API.
API
The interface module, implemented with Flask. It exposes an endpoint that returns a random usable proxy, which spreads the load across proxies; a client-side usage sketch follows.
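For context, this is roughly how a crawler would consume the pool through that interface. The snippet assumes the Flask API from Section 4 is already running locally on its default port 5000.

```python
# Hypothetical client-side use of the /get endpoint; assumes the API from
# Section 4 is running at http://127.0.0.1:5000 (Flask's default port).
import requests

proxy = requests.get('http://127.0.0.1:5000/get').text
print('Using proxy:', proxy)
response = requests.get('https://www.baidu.com',
                        proxies={'http': 'http://' + proxy,
                                 'https': 'http://' + proxy})
print(response.status_code)
```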
4. Implementation of the proxy pool
Storage module (db.py)
import redis

from proxypool.error import PoolEmptyError
from proxypool.setting import HOST, PORT, PASSWORD


class RedisClient(object):
    def __init__(self, host=HOST, port=PORT):
        if PASSWORD:
            self._db = redis.Redis(host=host, port=port, password=PASSWORD)
        else:
            self._db = redis.Redis(host=host, port=port)

    def get(self, count=1):
        '''
        Take a batch of proxies from the left end; the left end holds the
        oldest proxies, the right end the freshest.
        '''
        proxies = self._db.lrange("proxies", 0, count - 1)
        # Trim the list so only the remaining proxies are kept
        self._db.ltrim("proxies", count, -1)
        return proxies

    def put(self, proxy):
        '''
        Append a proxy to the right end.
        '''
        self._db.rpush("proxies", proxy)

    def pop(self):
        '''
        Called by the API: pop from the right end, returning and removing
        the freshest proxy.
        '''
        proxy = self._db.rpop("proxies")
        if proxy is None:
            raise PoolEmptyError
        return proxy.decode('utf-8')

    @property
    def queue_len(self):
        '''
        Length of the proxy queue.
        '''
        return self._db.llen("proxies")

    def flush(self):
        '''
        Flush the queue.
        '''
        self._db.flushall()


if __name__ == '__main__':
    conn = RedisClient()
    print(conn.pop())
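To make the queue semantics concrete, an interactive session against a local Redis instance (with an initially empty proxies key) would behave roughly like this; the two proxy strings are placeholders:

```python
# Illustrative session; '1.2.3.4:8080' and '5.6.7.8:8888' are placeholder proxies.
from proxypool.db import RedisClient

conn = RedisClient()
conn.put('1.2.3.4:8080')    # appended on the right (freshest end)
conn.put('5.6.7.8:8888')
print(conn.queue_len)       # 2
print(conn.get(1))          # [b'1.2.3.4:8080'], taken and trimmed from the left (oldest end)
print(conn.pop())           # '5.6.7.8:8888', popped from the right
```

Old proxies therefore flow out through get() for re-testing, while the API hands out the freshest ones via pop().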
Tester and scheduler (schedule.py)
import time
import asyncio
from multiprocessing import Process

import aiohttp

from proxypool.db import RedisClient
from proxypool.error import ResourceDepletionError
from proxypool.getter import FreeProxyGetter
from proxypool.setting import *

try:
    from aiohttp.errors import ProxyConnectionError, ServerDisconnectedError, ClientResponseError
except ImportError:
    # Newer aiohttp versions renamed/moved these exceptions
    from aiohttp import ClientProxyConnectionError as ProxyConnectionError, \
        ServerDisconnectedError, ClientResponseError, ClientConnectionError


class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        '''
        Test one proxy; if it can fetch the test page, put it back into the pool.
        '''
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(self.test_api, proxy=real_proxy,
                                           timeout=get_proxy_timeout) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy', proxy)
                except (aiohttp.ServerDisconnectedError, aiohttp.ClientResponseError,
                        aiohttp.ClientConnectorError) as s:
                    print(s)
        except Exception as e:
            # Any other failure just means this proxy is not put back
            print(e)

    def test(self):
        '''
        Test all raw proxies asynchronously.
        '''
        print('ValidityTester is working')
        try:
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            print('Async Error')


class PoolAdder(object):
    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        '''
        Judge whether the pool has reached the upper threshold.
        '''
        if self._conn.queue_len >= self._threshold:
            return True
        else:
            return False

    def add_to_queue(self):
        print('PoolAdder is working')
        proxy_count = 0
        while not self.is_over_threshold():
            for callback_label in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_label]
                raw_proxies = self._crawler.get_raw_proxies(callback)
                # Test the freshly crawled proxies before counting them
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()
                proxy_count += len(raw_proxies)
                if self.is_over_threshold():
                    print('IP is enough, waiting to be used')
                    break
            if proxy_count == 0:
                raise ResourceDepletionError


class Schedule(object):
    @staticmethod
    def valid_proxy(cycle=VALID_CHECK_CYCLE):
        '''
        Re-test the older half of the proxies in Redis on every cycle.
        '''
        conn = RedisClient()
        tester = ValidityTester()
        while True:
            print('Refreshing IP')
            count = int(0.5 * conn.queue_len)
            if count == 0:
                print('Waiting for adding')
                time.sleep(cycle)
                continue
            raw_proxies = conn.get(count)
            tester.set_raw_proxies(raw_proxies)
            tester.test()
            time.sleep(cycle)

    @staticmethod
    def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
                   upper_threshold=POOL_UPPER_THRESHOLD,
                   cycle=POOL_LEN_CHECK_CYCLE):
        '''
        If the number of proxies drops below lower_threshold, add more.
        '''
        conn = RedisClient()
        adder = PoolAdder(upper_threshold)
        while True:
            if conn.queue_len < lower_threshold:
                adder.add_to_queue()
            time.sleep(cycle)

    def run(self):
        print('IP processing running')
        valid_process = Process(target=Schedule.valid_proxy)
        check_process = Process(target=Schedule.check_pool)
        valid_process.start()
        check_process.start()
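Before wiring the tester into the scheduler, it can be exercised on its own. The proxy below is only a placeholder and is expected to fail the check:

```python
# Quick standalone check of ValidityTester; the proxy value is a placeholder.
from proxypool.schedule import ValidityTester

if __name__ == '__main__':
    tester = ValidityTester()
    tester.set_raw_proxies(['127.0.0.1:8888'])   # placeholder proxy, will likely be rejected
    tester.test()
```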
Getter module (getter.py)
import re

from pyquery import PyQuery as pq

from .utils import get_page


class ProxyMetaclass(type):
    '''
    Metaclass that adds two attributes to FreeProxyGetter: __CrawlFunc__,
    the list of crawler method names, and __CrawlFuncCount__, the number
    of crawler methods.
    '''
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class FreeProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, callback):
        proxies = []
        print('Callback', callback)
        for proxy in eval("self.{}()".format(callback)):
            print('Getting', proxy, 'from', callback)
            proxies.append(proxy)
        return proxies

    def crawl_kuaidaili(self):
        for page in range(1, 4):
            start_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
            html = get_page(start_url)
            ip_address = re.compile(r'<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>')
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ":" + port
                yield result.replace(' ', '')

    def crawl_xicidaili(self):
        for page in range(1, 4):
            start_url = 'https://www.xicidaili.com/nn/{}'.format(page)
            html = get_page(start_url)
            # \s* matches the whitespace and line breaks between table cells
            ip_address = re.compile(r'<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ":" + port
                yield result.replace(' ', '')

    def crawl_daili66(self, page_count=4):
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])
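The effect of ProxyMetaclass is easy to check: every method whose name contains crawl_ is registered at class-creation time, and get_raw_proxies then drains one of those generators by name (the last call below performs real HTTP requests):

```python
# Demonstrates what the metaclass collects; expected output shown in comments.
from proxypool.getter import FreeProxyGetter

crawler = FreeProxyGetter()
print(crawler.__CrawlFuncCount__)   # 3
print(crawler.__CrawlFunc__)        # ['crawl_kuaidaili', 'crawl_xicidaili', 'crawl_daili66']
proxies = crawler.get_raw_proxies('crawl_daili66')   # crawls www.66ip.cn and collects results
```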
api.py
from flask import Flask, g

from .db import RedisClient

__all__ = ['app']

app = Flask(__name__)


def get_conn():
    '''
    Open a new Redis connection if there is none yet for the current
    application context.
    '''
    if not hasattr(g, 'redis_client'):
        g.redis_client = RedisClient()
    return g.redis_client


@app.route('/')
def index():
    return '<h2>Welcome to my Proxy Pool</h2>'


@app.route('/get')
def get_proxy():
    conn = get_conn()
    return conn.pop()


@app.route('/count')
def get_counts():
    conn = get_conn()
    return str(conn.queue_len)


if __name__ == '__main__':
    app.run()
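The /get route above always pops the freshest proxy and removes it. Section 3 mentions returning a random proxy for load balancing; one possible variant (not part of the original api.py, and reaching into the RedisClient's private _db attribute for brevity) could sample the list without consuming it:

```python
# Hypothetical /random route added to the Flask app above: returns a proxy
# chosen uniformly from the current pool without removing it.
import random

@app.route('/random')
def get_random_proxy():
    conn = get_conn()
    proxies = conn._db.lrange("proxies", 0, -1)   # peek at the whole list
    if not proxies:
        return 'The proxy pool is empty', 404
    return random.choice(proxies).decode('utf-8')
```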
Remaining parts: the entry script, the settings (setting.py), and the custom errors (error.py)
from proxypool.api import app
from proxypool.schedule import Schedule


def main():
    s = Schedule()
    s.run()      # start the tester and pool-adder processes
    app.run()    # then serve the Flask API in the main process


if __name__ == '__main__':
    main()
setting.py

HOST = 'localhost'
PORT = 6379
PASSWORD = ''

# Timeout (in seconds) for a single proxy test request
get_proxy_timeout = 9

# Size limits of the proxy pool
POOL_LOWER_THRESHOLD = 20
POOL_UPPER_THRESHOLD = 100

# Check cycles (in seconds)
VALID_CHECK_CYCLE = 60
POOL_LEN_CHECK_CYCLE = 20

# Test API used to verify that a proxy works
TEST_API = 'https://www.baidu.com'
error.py

class ResourceDepletionError(Exception):
    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy source is exhausted')


class PoolEmptyError(Exception):
    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy pool is empty')
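For completeness, this is how the custom errors surface to a caller; the snippet assumes Redis is running but the pool may be empty:

```python
# Illustrative handling of PoolEmptyError when fetching a proxy directly.
from proxypool.db import RedisClient
from proxypool.error import PoolEmptyError

conn = RedisClient()
try:
    print(conn.pop())
except PoolEmptyError:
    print('No proxies available yet')
```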