不对IPProxyPool源码进行褒贬,致敬开源精神。我根据自己的实际业务需要,对其稍作改造,同时解读其设计思路。
我的python环境是python3,
1、ipproxy启动
查看IPProxy.py
中,这里有五个进程(p0 到 p4)
# REST API server process.
p0 = Process(target=start_api_server)
# Proxy-crawling process.
p1 = Process(target=startProxyCrawl, args=(q1, DB_PROXY_NUM,myip))
# Validates the effectiveness of crawled IP proxies.
p2 = Process(target=validator, args=(q1, q2, myip))
# Persists validated proxies into MySQL.
p3 = Process(target=store_data, args=(q2, DB_PROXY_NUM))
# Mirrors proxies into Redis.
p4 = Process(target=startStoreRedis)
2、web框架
他用的是web.py,可以通过pip install web.py==0.40.dev0
进行安装
web.py相当轻量,整个api服务都在apiServer.py
中,这里增加了一个random方法,从代理池中随机获取一条信息
# coding:utf-8
'''
Proxy-pool API server.

Query keywords accepted by the endpoints: count, type, protocol,
country, area.
'''
import json
import sys
import web
import config
from db.DataStore import sqlhelper
from db.RedisDao import RedisDao
from util.LogHandler import LogHandler

logger = LogHandler('api_server')

# web.py routing table: URL path -> handler class name (looked up in globals()).
urls = (
    '/', 'select',
    '/delete', 'delete',
    '/random', 'random',
)

# Shared Redis DAO used by the /random endpoint.
redisDao = RedisDao()
def start_api_server():
    """Launch the web.py application, listening on 0.0.0.0 at config.API_PORT."""
    logger.info('start api server')
    # web.py reads the listen address from sys.argv.
    listen_addr = '0.0.0.0:%s' % config.API_PORT
    sys.argv.append(listen_addr)
    web.application(urls, globals()).run()
class select(object):
    """Handler for GET /: query proxies from MySQL using request parameters."""

    def GET(self):
        params = web.input()
        # 'count' caps the number of rows; the full param dict is the filter.
        limit = params.get('count', None)
        return json.dumps(sqlhelper.select(limit, params))
class delete(object):
    """Handler for GET /delete: remove proxies matching request parameters."""

    params = {}

    def GET(self):
        criteria = web.input()
        result = sqlhelper.delete(criteria)
        return json.dumps(result)
class random(object):
    """Handler for GET /random: return one random proxy (as JSON) from Redis."""

    params = {}

    def GET(self):
        query = web.input()
        # Default to plain HTTP proxies when no 'types' param is given.
        proxy_type = query.get('types', 'http')
        return redisDao.random_get_json(types=proxy_type)
if __name__ == '__main__':
    # Standalone debug entry point: serve on a fixed port (8000) rather
    # than the configured config.API_PORT used by start_api_server().
    sys.argv.append('0.0.0.0:8000')
    web.application(urls, globals()).run()
3 代理爬取
3.1 获取对外IP
对外的IP,就是别人访问你的IP,获取比较简单,通过在浏览器中输入http://httpbin.org/ip
就可以获取
3.2 通过策略模式获取代理清单
大话设计模式Python实现-策略模式,策略模式很简单,java里面是一个接口多个实现类,而在python里面定义了一个抽象类就可以。
3.3 多线程检验代理有效性
这里涉及到了gevent的使用,下面是示例代码
# Fan validation jobs out as gevent greenlets, joining in batches so at
# most MAX_CHECK_CONCURRENT_PER_PROCESS checks are in flight at once.
spawns = []
for proxy in proxylist:
    spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
    if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
        gevent.joinall(spawns)
        spawns = []
# Join whatever remains in the final partial batch.
gevent.joinall(spawns)
如何检测代理,核心代码在Validator.py
中
# Build a requests-style proxies mapping from the candidate record.
ip = proxy['ip']
port = proxy['port']
# NOTE(review): the "https" entry also uses the http:// scheme — this is how
# the project tunnels HTTPS through the proxy; confirm against requests docs.
proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
# Dynamically dispatch to the checker function named in
# config.CHECK_PROXY['function'], resolved from this module's namespace.
# NOTE(review): `selfip` must come from the enclosing scope — the fragment
# does not show where it is defined.
protocol, types, speed = getattr(sys.modules[__name__],config.CHECK_PROXY['function'])(selfip, proxies)
验证ip代理评分的方法根据speed计算,而speed是根据访问http://httpbin.org/get
的速度
def _checkHttpProxy(selfip, proxies, isHttp=True):
    """Probe a proxy by fetching the configured httpbin test URL through it.

    Args:
        selfip: this machine's public IP. Unused in the body but kept for
            signature compatibility with the other checker functions that
            config.CHECK_PROXY can dispatch to.
        proxies: requests-style proxy mapping, e.g. {"http": "http://ip:port"}.
        isHttp: True to probe the HTTP test endpoint, False for HTTPS.

    Returns:
        (ok, types, speed) where:
            ok    -- True when the probe succeeded;
            types -- anonymity level: 0 high-anonymous, 1 anonymous (the
                     Proxy-Connection header leaked through), 2 transparent
                     (our IP appears in 'origin' alongside the proxy's);
                     -1 when the probe failed;
            speed -- round-trip seconds rounded to 2 decimals, -1 on failure.
    """
    types = -1
    speed = -1
    test_url = config.TEST_HTTP_HEADER if isHttp else config.TEST_HTTPS_HEADER
    try:
        start = time.time()
        r = requests.get(url=test_url, headers=config.get_header(),
                         timeout=config.TIMEOUT, proxies=proxies)
        if r.ok:
            speed = round(time.time() - start, 2)
            content = json.loads(r.text)
            headers = content['headers']
            origin_ip = content['origin']
            # httpbin echoes the caller's address in 'origin'; a comma means
            # both our IP and the proxy's were forwarded -> transparent proxy.
            if ',' in origin_ip:
                types = 2
            elif headers.get('Proxy-Connection', None):
                types = 1
            else:
                types = 0
            return True, types, speed
        return False, types, speed
    except Exception:
        # Broad by design: any network or parse failure marks the proxy as
        # unusable; callers only consume the (False, -1, -1) triple.
        return False, types, speed
3.4 从代理服务器中爬取ip
config.py
中parserList中设置代理服务器地址,免费的基本不靠谱,我是将购买的代理IP提取API放在里面
3.5 将爬取的代理ip写入queue
将有效的ip放入到queue中,供其他进程存储
def validator(queue1, queue2, myip):
    """Batch proxies from queue1 and fan them out to checker subprocesses.

    queue1 -- incoming crawled proxies; queue2 -- sink for validated
    proxies (written by the child processes); myip -- this machine's
    public IP, forwarded to each checker. Loops forever.
    """
    logger.info('start validator')
    tasklist = []
    proc_pool = {}    # pid -> Process for every live checker subprocess
    cntl_q = Queue()  # control queue: children report their pid here when done
    while True:
        if not cntl_q.empty():
            # Reap one finished child: drop it from the pool and make sure
            # the OS process is really gone (psutil kill + wait).
            logger.debug('kill process')
            try:
                pid = cntl_q.get()
                proc = proc_pool.pop(pid)
                proc_ps = psutil.Process(pid)
                proc_ps.kill()
                proc_ps.wait()
            except Exception as e:
                # Best effort: the child may already have exited.
                pass
                # print(e)
                # print(" we are unable to kill pid:%s" % (pid))
        try:
            # proxy_dict = {'source':'crawl','data':proxy}
            if len(proc_pool) >= config.MAX_CHECK_PROCESS:
                # Pool is saturated; back off before accepting more work.
                time.sleep(config.CHECK_WATI_TIME)
                continue
            proxy = queue1.get()
            tasklist.append(proxy)
            # logger.debug(proxy)
            if len(tasklist) >= config.MAX_CHECK_CONCURRENT_PER_PROCESS:
                # A full batch: hand it off to a fresh checker subprocess.
                p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
                p.start()
                proc_pool[p.pid] = p
                tasklist = []
        except Exception as e:
            # NOTE(review): reached when queue1.get() raises (e.g. interrupt
            # or timeout) — flush the partial batch so no proxy is lost.
            if len(tasklist) > 0:
                p = Process(target=process_start, args=(tasklist, myip, queue2, cntl_q))
                p.start()
                proc_pool[p.pid] = p
                tasklist = []
4、将代理写入到mysql
通过mysql写入动态代理IP,结合评分机制很容易,故先将数据写入到mysql中
5、将代理写入到redis中
哨兵模式链接redis
# Redis connection settings. Two modes are supported, selected by
# DB_CONNECT_TYPE: a plain single-node 'redis' URL, or 'redis-sentinel'
# for a sentinel-managed master group.
REDIS_CONFIG = {
    # 'DB_CONNECT_TYPE': 'redis',  # plain single-node mode
    # 'DB_CONNECT_STRING': 'redis://192.168.5.174:6379/8',
    'DB_CONNECT_TYPE': 'redis-sentinel',  # sentinel mode (used below)
    'SERVICE_NAME': 'mymaster',           # sentinel master-group name
    'DB': 13,                             # logical redis database index
    'SENTINELS': [('192.168.5.172', 26379),('192.168.5.173', 26379),('192.168.5.174', 26379)]
}
def __init__(self, url=None):
    """Connect to Redis, either directly or through a sentinel group.

    Args:
        url: optional connection URL override; only consulted in plain
            'redis' mode, falling back to config.REDIS_CONFIG.
    """
    db_connect_type = config.REDIS_CONFIG['DB_CONNECT_TYPE']
    if db_connect_type == 'redis':
        # Plain single-node mode. (The original passed `url or redis_url`
        # to from_url, but redis_url already folds in `url` — redundant.)
        redis_url = url or config.REDIS_CONFIG['DB_CONNECT_STRING']
        self.redis = Redis.from_url(redis_url, **config.REDIS_PARAMS)
    else:
        # Sentinel mode: ask the sentinels for the current master of
        # SERVICE_NAME and connect to it.
        service_name = config.REDIS_CONFIG['SERVICE_NAME']
        db = config.REDIS_CONFIG['DB']
        sentinels = config.REDIS_CONFIG['SENTINELS']
        sentinel = Sentinel(sentinels, **config.REDIS_PARAMS)
        # Pass the target db via a local copy instead of mutating the
        # shared config.REDIS_PARAMS dict (the original wrote
        # config.REDIS_PARAMS['db'] = db, which leaked into any later
        # Sentinel(...) construction and made __init__ non-idempotent).
        master_params = dict(config.REDIS_PARAMS, db=db)
        self.redis = sentinel.master_for(service_name, **master_params)
    self.redis_prefix = 'eie_proxy_'
    self.limit = 50         # max number of proxies kept in redis
    self.expire_time = 900  # seconds before a cached proxy entry expires