1. Why build a proxy pool?
- Many websites deploy dedicated anti-crawling measures, so a crawler may run into IP bans.
- A large number of free proxies are published openly on the internet and can be put to use.
- With periodic checking and maintenance, these free proxies can still yield reliable ones.
2. Requirements for the proxy pool
- Crawl from multiple sites, check proxies asynchronously
- Filter on a schedule, keep the pool continuously updated
- Provide an interface so proxies are easy to fetch
3. Architecture of the proxy pool
Getter module
Crawls the proxy sites on a schedule, normalizes the results into a fixed format, and then saves the usable proxies to the database.
Storage module
Uses a Redis sorted set. Proxies must not be duplicated, each proxy's availability must be tracked, and every proxy must be handled dynamically in real time. This is also the central module.
In a Redis sorted set, each proxy is a member of the set and carries a score field; the set is ordered by score, with low scores on the left and high scores on the right.
Scoring rules:
1) A score of 100 means usable. When a check succeeds the score is set to 100; when it fails the score is decreased by 1, and once it reaches 0 the proxy is removed.
2) A newly crawled proxy starts with a score of 10 and is set to 100 once it passes a test.
A sketch of these scoring rules in code is shown below; note that the full implementation in Section 4 uses a simpler list-based queue instead.
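As a rough illustration only, the scoring rules map directly onto Redis sorted-set commands. The sketch below uses redis-py; the class name ScoredRedisClient and the key name proxies:zset are made up for this example and are not part of the code in Section 4.

```python
# Illustrative sketch of the score-based design on a Redis sorted set.
# ScoredRedisClient and the key 'proxies:zset' are assumed names, not
# part of the implementation shown in Section 4.
import redis

MAX_SCORE, INITIAL_SCORE, MIN_SCORE = 100, 10, 0

class ScoredRedisClient(object):
    def __init__(self, host='localhost', port=6379):
        self._db = redis.Redis(host=host, port=port)

    def add(self, proxy):
        # Rule 2: a newly crawled proxy starts at 10
        if self._db.zscore('proxies:zset', proxy) is None:
            self._db.zadd('proxies:zset', {proxy: INITIAL_SCORE})

    def max(self, proxy):
        # Rules 1 and 2: a successful check promotes the proxy to 100
        self._db.zadd('proxies:zset', {proxy: MAX_SCORE})

    def decrease(self, proxy):
        # Rule 1: a failed check costs one point; at 0 the proxy is removed
        score = self._db.zincrby('proxies:zset', -1, proxy)
        if score <= MIN_SCORE:
            self._db.zrem('proxies:zset', proxy)
```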
Tester module
Periodically checks the proxies in the database, marks each proxy's status through its score, and keeps good proxies available for the API.
API
The interface module, implemented with Flask. It exposes an endpoint that returns a random usable proxy, which spreads the load across proxies; a client-side usage sketch follows.
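For context, this is roughly how a crawler would consume the pool through that interface. The snippet assumes the Flask API from Section 4 is already running locally on its default port 5000.

```python
# Hypothetical client-side use of the /get endpoint; assumes the API from
# Section 4 is running at http://127.0.0.1:5000 (Flask's default port).
import requests

proxy = requests.get('http://127.0.0.1:5000/get').text
print('Using proxy:', proxy)
response = requests.get('https://www.baidu.com',
                        proxies={'http': 'http://' + proxy,
                                 'https': 'http://' + proxy})
print(response.status_code)
```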
4. Implementation of the proxy pool
Storage module (db.py)
import redis

from proxypool.error import PoolEmptyError
from proxypool.setting import HOST, PORT, PASSWORD


class RedisClient(object):
    def __init__(self, host=HOST, port=PORT):
        if PASSWORD:
            self._db = redis.Redis(host=host, port=port, password=PASSWORD)
        else:
            self._db = redis.Redis(host=host, port=port)

    def get(self, count=1):
        '''
        Take a batch of proxies from the left end; the left end holds the
        oldest proxies, the right end the freshest.
        '''
        proxies = self._db.lrange("proxies", 0, count - 1)
        # Trim the list so only the remaining proxies are kept
        self._db.ltrim("proxies", count, -1)
        return proxies

    def put(self, proxy):
        '''
        Append a proxy to the right end.
        '''
        self._db.rpush("proxies", proxy)

    def pop(self):
        '''
        Called by the API: pop from the right end, returning and removing
        the freshest proxy.
        '''
        proxy = self._db.rpop("proxies")
        if proxy is None:
            raise PoolEmptyError
        return proxy.decode('utf-8')

    @property
    def queue_len(self):
        '''
        Length of the proxy queue.
        '''
        return self._db.llen("proxies")

    def flush(self):
        '''
        Flush the queue.
        '''
        self._db.flushall()


if __name__ == '__main__':
    conn = RedisClient()
    print(conn.pop())
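To make the queue semantics concrete, an interactive session against a local Redis instance (with an initially empty proxies key) would behave roughly like this; the two proxy strings are placeholders:

```python
# Illustrative session; '1.2.3.4:8080' and '5.6.7.8:8888' are placeholder proxies.
from proxypool.db import RedisClient

conn = RedisClient()
conn.put('1.2.3.4:8080')    # appended on the right (freshest end)
conn.put('5.6.7.8:8888')
print(conn.queue_len)       # 2
print(conn.get(1))          # [b'1.2.3.4:8080'], taken and trimmed from the left (oldest end)
print(conn.pop())           # '5.6.7.8:8888', popped from the right
```

Old proxies therefore flow out through get() for re-testing, while the API hands out the freshest ones via pop().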
Tester and scheduler (schedule.py)
import time
import asyncio
from multiprocessing import Process

import aiohttp

from proxypool.db import RedisClient
from proxypool.error import ResourceDepletionError
from proxypool.getter import FreeProxyGetter
from proxypool.setting import *

try:
    from aiohttp.errors import ProxyConnectionError, ServerDisconnectedError, ClientResponseError
except ImportError:
    # Newer aiohttp versions renamed/moved these exceptions
    from aiohttp import ClientProxyConnectionError as ProxyConnectionError, \
        ServerDisconnectedError, ClientResponseError, ClientConnectionError


class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = RedisClient()

    async def test_single_proxy(self, proxy):
        '''
        Test one proxy; if it can fetch the test page, put it back into the pool.
        '''
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    print('Testing', proxy)
                    async with session.get(self.test_api, proxy=real_proxy,
                                           timeout=get_proxy_timeout) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy', proxy)
                except (aiohttp.ServerDisconnectedError, aiohttp.ClientResponseError,
                        aiohttp.ClientConnectorError) as s:
                    print(s)
        except Exception as e:
            # Any other failure just means this proxy is not put back
            print(e)

    def test(self):
        '''
        Test all raw proxies asynchronously.
        '''
        print('ValidityTester is working')
        try:
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except ValueError:
            print('Async Error')


class PoolAdder(object):
    def __init__(self, threshold):
        self._threshold = threshold
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        '''
        Judge whether the pool has reached the upper threshold.
        '''
        if self._conn.queue_len >= self._threshold:
            return True
        else:
            return False

    def add_to_queue(self):
        print('PoolAdder is working')
        proxy_count = 0
        while not self.is_over_threshold():
            for callback_label in range(self._crawler.__CrawlFuncCount__):
                callback = self._crawler.__CrawlFunc__[callback_label]
                raw_proxies = self._crawler.get_raw_proxies(callback)
                # Test the freshly crawled proxies before counting them
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()
                proxy_count += len(raw_proxies)
                if self.is_over_threshold():
                    print('IP is enough, waiting to be used')
                    break
            if proxy_count == 0:
                raise ResourceDepletionError


class Schedule(object):
    @staticmethod
    def valid_proxy(cycle=VALID_CHECK_CYCLE):
        '''
        Re-test the older half of the proxies in Redis on every cycle.
        '''
        conn = RedisClient()
        tester = ValidityTester()
        while True:
            print('Refreshing IP')
            count = int(0.5 * conn.queue_len)
            if count == 0:
                print('Waiting for adding')
                time.sleep(cycle)
                continue
            raw_proxies = conn.get(count)
            tester.set_raw_proxies(raw_proxies)
            tester.test()
            time.sleep(cycle)

    @staticmethod
    def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
                   upper_threshold=POOL_UPPER_THRESHOLD,
                   cycle=POOL_LEN_CHECK_CYCLE):
        '''
        If the number of proxies drops below lower_threshold, add more.
        '''
        conn = RedisClient()
        adder = PoolAdder(upper_threshold)
        while True:
            if conn.queue_len < lower_threshold:
                adder.add_to_queue()
            time.sleep(cycle)

    def run(self):
        print('IP processing running')
        valid_process = Process(target=Schedule.valid_proxy)
        check_process = Process(target=Schedule.check_pool)
        valid_process.start()
        check_process.start()
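Before wiring the tester into the scheduler, it can be exercised on its own. The proxy below is only a placeholder and is expected to fail the check:

```python
# Quick standalone check of ValidityTester; the proxy value is a placeholder.
from proxypool.schedule import ValidityTester

if __name__ == '__main__':
    tester = ValidityTester()
    tester.set_raw_proxies(['127.0.0.1:8888'])   # placeholder proxy, will likely be rejected
    tester.test()
```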
Getter module (getter.py)
import re

from pyquery import PyQuery as pq

from .utils import get_page


class ProxyMetaclass(type):
    '''
    Metaclass that adds two attributes to FreeProxyGetter: __CrawlFunc__,
    the list of crawler method names, and __CrawlFuncCount__, the number
    of crawler methods.
    '''
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class FreeProxyGetter(object, metaclass=ProxyMetaclass):
    def get_raw_proxies(self, callback):
        proxies = []
        print('Callback', callback)
        for proxy in eval("self.{}()".format(callback)):
            print('Getting', proxy, 'from', callback)
            proxies.append(proxy)
        return proxies

    def crawl_kuaidaili(self):
        for page in range(1, 4):
            start_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
            html = get_page(start_url)
            ip_address = re.compile(r'<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>')
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ":" + port
                yield result.replace(' ', '')

    def crawl_xicidaili(self):
        for page in range(1, 4):
            start_url = 'https://www.xicidaili.com/nn/{}'.format(page)
            html = get_page(start_url)
            # \s* matches the whitespace and line breaks between table cells
            ip_address = re.compile(r'<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn"></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
            re_ip_address = ip_address.findall(str(html))
            for address, port in re_ip_address:
                result = address + ":" + port
                yield result.replace(' ', '')

    def crawl_daili66(self, page_count=4):
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])
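The effect of ProxyMetaclass is easy to check: every method whose name contains crawl_ is registered at class-creation time, and get_raw_proxies then drains one of those generators by name (the last call below performs real HTTP requests):

```python
# Demonstrates what the metaclass collects; expected output shown in comments.
from proxypool.getter import FreeProxyGetter

crawler = FreeProxyGetter()
print(crawler.__CrawlFuncCount__)   # 3
print(crawler.__CrawlFunc__)        # ['crawl_kuaidaili', 'crawl_xicidaili', 'crawl_daili66']
proxies = crawler.get_raw_proxies('crawl_daili66')   # crawls www.66ip.cn and collects results
```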
api.py
from flask import Flask, g

from .db import RedisClient

__all__ = ['app']

app = Flask(__name__)


def get_conn():
    '''
    Open a new Redis connection if there is none yet for the current
    application context.
    '''
    if not hasattr(g, 'redis_client'):
        g.redis_client = RedisClient()
    return g.redis_client


@app.route('/')
def index():
    return '<h2>Welcome to my Proxy Pool</h2>'


@app.route('/get')
def get_proxy():
    conn = get_conn()
    return conn.pop()


@app.route('/count')
def get_counts():
    conn = get_conn()
    return str(conn.queue_len)


if __name__ == '__main__':
    app.run()
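The /get route above always pops the freshest proxy and removes it. Section 3 mentions returning a random proxy for load balancing; one possible variant (not part of the original api.py, and reaching into the RedisClient's private _db attribute for brevity) could sample the list without consuming it:

```python
# Hypothetical /random route added to the Flask app above: returns a proxy
# chosen uniformly from the current pool without removing it.
import random

@app.route('/random')
def get_random_proxy():
    conn = get_conn()
    proxies = conn._db.lrange("proxies", 0, -1)   # peek at the whole list
    if not proxies:
        return 'The proxy pool is empty', 404
    return random.choice(proxies).decode('utf-8')
```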
Remaining parts: the entry script, the settings (setting.py), and the custom errors (error.py)
from proxypool.api import app
from proxypool.schedule import Schedule


def main():
    s = Schedule()
    s.run()      # start the tester and pool-adder processes
    app.run()    # then serve the Flask API in the main process


if __name__ == '__main__':
    main()
setting.py

HOST = 'localhost'
PORT = 6379
PASSWORD = ''

# Timeout (in seconds) for a single proxy test request
get_proxy_timeout = 9

# Size limits of the proxy pool
POOL_LOWER_THRESHOLD = 20
POOL_UPPER_THRESHOLD = 100

# Check cycles (in seconds)
VALID_CHECK_CYCLE = 60
POOL_LEN_CHECK_CYCLE = 20

# Test API used to verify that a proxy works
TEST_API = 'https://www.baidu.com'
error.py

class ResourceDepletionError(Exception):
    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy source is exhausted')


class PoolEmptyError(Exception):
    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy pool is empty')
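For completeness, this is how the custom errors surface to a caller; the snippet assumes Redis is running but the pool may be empty:

```python
# Illustrative handling of PoolEmptyError when fetching a proxy directly.
from proxypool.db import RedisClient
from proxypool.error import PoolEmptyError

conn = RedisClient()
try:
    print(conn.pop())
except PoolEmptyError:
    print('No proxies available yet')
```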