About the IP proxy pool
If you've found your way here, you probably already know what an IP proxy pool is for, so no more chatter; here's the code.
Project layout: just create the files following the structure sketched below.
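The layout below is inferred from the import statements in the code that follows; the top-level folder name and the empty __init__.py are assumptions:

proxypool-project/
├── run.py
└── proxypool/
    ├── __init__.py
    ├── api.py
    ├── db.py
    ├── getter.py
    ├── scheduler.py
    └── settings.py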
api.py
from flask import Flask, g

from proxypool.db import Redis_Client

# With `from ... import *`, only the names listed in __all__ are exported.
__all__ = ['app']

app = Flask(__name__)


def get_conn():
    # Cache the Redis connection on Flask's application context `g` so it can be reused.
    if not hasattr(g, 'redis_client'):
        g.redis_client = Redis_Client()
    return g.redis_client


@app.route('/')
def index():
    return '<h1>Welcome to the proxy pool system!</h1>'


@app.route('/get')
def get():
    # Pop the freshest proxy from the tail of the pool and return it as text.
    conn = get_conn()
    proxy = conn.pop()
    if not proxy:
        return 'The proxy pool is empty!'
    if isinstance(proxy, bytes):
        proxy = proxy.decode('utf-8')
    return proxy


@app.route('/count')
def count():
    # Current number of proxies in the pool.
    return str(get_conn().queue_len)
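Once run.py (at the end of this post) is up, any crawler can pull proxies over HTTP. A minimal sketch, assuming Flask is listening on its default 127.0.0.1:5000 and using an example target URL:

import requests

# Ask the pool for one proxy ('ip:port') and for the current pool size.
proxy = requests.get('http://127.0.0.1:5000/get').text
count = requests.get('http://127.0.0.1:5000/count').text
print('got proxy %s, pool size %s' % (proxy, count))

# Route a request through that proxy.
resp = requests.get('https://www.baidu.com/',
                    proxies={'http': 'http://' + proxy, 'https': 'http://' + proxy},
                    timeout=10)
print(resp.status_code)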
db.py
import redis

from proxypool.settings import PORT, HOST, PASSWORD, PROXIES


class Redis_Client(object):
    def __init__(self, host=HOST, port=PORT):
        # Connect with a password only if one is configured.
        if PASSWORD:
            self._db = redis.Redis(host=host, port=port, password=PASSWORD)
        else:
            self._db = redis.Redis(host=host, port=port)

    def pop(self):
        # When a proxy is needed, pop the freshest one from the tail of the list.
        proxy = self._db.rpop(PROXIES)
        return proxy.decode('utf-8') if proxy else None

    def get(self, count=1):
        '''
        Take `count` proxies from the head of the list (for re-validation) and
        remove them from the pool.
        :param count: how many proxies to take
        :return: the proxies that were taken
        '''
        # Read the first `count` items...
        proxies = self._db.lrange(PROXIES, 0, count - 1)
        # ...then trim them off the head of the list.
        self._db.ltrim(PROXIES, count, -1)
        return proxies

    def put(self, proxy):
        # Append a validated proxy to the tail of the list.
        self._db.rpush(PROXIES, proxy)

    @property
    def queue_len(self):
        # Current size of the proxy pool.
        return self._db.llen(PROXIES)

    def flush(self):
        # Note: flushall() empties the entire Redis instance, not just this list.
        self._db.flushall()
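A quick sanity check for Redis_Client, assuming Redis is running locally with the settings at the end of this post (the proxy address here is made up):

from proxypool.db import Redis_Client

conn = Redis_Client()
conn.put('127.0.0.1:8888')   # push a dummy proxy onto the tail
print(conn.queue_len)        # 1
print(conn.get(1))           # take it back from the head: [b'127.0.0.1:8888']
print(conn.pop())            # None, because the list is empty again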
getter.py
import re

import requests
from lxml import etree


class ProxyMetaclass(type):
    def __new__(cls, classname, bases, attrs):
        '''
        A metaclass's __new__ runs when the class body is executed, before any
        instance exists, so this is where the crawl_xxx methods get registered.
        :param classname: name of the class being created
        :param bases: tuple of base classes
        :param attrs: dict of all class attributes
        :return: the new class, with __Crawl_Func__ and __Crawl_Count__ added
        '''
        count = 0
        attrs['__Crawl_Func__'] = []
        for k, v in attrs.items():
            # Any method whose name contains 'crawl_' is registered as a crawler.
            if 'crawl_' in k:
                attrs['__Crawl_Func__'].append(k)
                count += 1
        attrs['__Crawl_Count__'] = count
        return type.__new__(cls, classname, bases, attrs)


class FreeProxyGetter(object, metaclass=ProxyMetaclass):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    }

    def get_raw_proxies(self, crawl):
        '''
        :param crawl: name of a crawl_xxx() method as a string
        :return: list of proxies scraped from that proxy site
        '''
        proxies = []
        # eval() works but is best avoided; getattr(self, crawl)() is the safer equivalent.
        for proxy in eval('self.{}()'.format(crawl)):
            proxies.append(proxy)
        return proxies

    def crawl_xicidaili(self):
        '''
        Scrape https://www.xicidaili.com/nn/
        :return: generator of 'ip:port' strings
        '''
        base_url = 'https://www.xicidaili.com/nn/%s'
        for page in range(1, 10):
            response = requests.get(base_url % page, headers=self.headers)
            tr_p = re.compile(r'<tr(.*?)</tr>', re.S)
            tr_list = tr_p.findall(response.text)
            # Skip the header row, then pull ip and port out of each table row.
            for tr in tr_list[1:]:
                p = re.compile(r'<td>(.*?)</td>', re.S)
                infos = p.findall(tr)
                ip = infos[0]
                port = infos[1]
                yield ip.strip() + ':' + port.strip()

    def crawl_66Ip(self):
        '''
        Scrape http://www.66ip.cn/
        :return: generator of 'ip:port' strings
        '''
        base_url = 'http://www.66ip.cn/%s.html'
        for page in range(1, 20):
            response = requests.get(base_url % page, headers=self.headers)
            html = etree.HTML(response.text)
            ips = html.xpath('//tr[position()>1]/td[1]/text()')
            ports = html.xpath('//tr[position()>1]/td[2]/text()')
            if len(ips) == len(ports) and ips and ports:
                for index, ip in enumerate(ips):
                    port = ports[index]
                    yield ip.strip() + ':' + port.strip()

    def crawl_ip3366(self):
        '''
        Scrape http://www.ip3366.net/?stype=1&page=1
        :return: generator of 'ip:port' strings
        '''
        base_url = 'http://www.ip3366.net/?stype=1&page=%s'
        for page in range(1, 11):
            response = requests.get(base_url % page, headers=self.headers)
            html = etree.HTML(response.text)
            ips = html.xpath('//tr/td[1]/text()')
            ports = html.xpath('//tr/td[2]/text()')
            if len(ips) == len(ports) and ips and ports:
                for index, ip in enumerate(ips):
                    port = ports[index]
                    yield ip.strip() + ':' + port.strip()


if __name__ == '__main__':
    f = FreeProxyGetter()
    # crawl_ip3366() is a generator, so it has to be iterated to do any work.
    for proxy in f.crawl_ip3366():
        print(proxy)
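To see what the metaclass actually produced, you can inspect the registered crawler names and drive them with getattr(), which does the same job as the eval() call above but more safely. A small sketch, assuming the free-proxy sites are still reachable:

from proxypool.getter import FreeProxyGetter

getter = FreeProxyGetter()
# The metaclass collected every crawl_xxx method name when the class was created.
print(FreeProxyGetter.__Crawl_Func__)   # ['crawl_xicidaili', 'crawl_66Ip', 'crawl_ip3366']
print(FreeProxyGetter.__Crawl_Count__)  # 3

# Same idea as get_raw_proxies(), but with getattr() instead of eval().
for name in FreeProxyGetter.__Crawl_Func__:
    for proxy in getattr(getter, name)():
        print(name, proxy)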
run.py
from proxypool.scheduler import Schedule
from proxypool.api import app


def main():
    # Start the scheduler (validator + adder background processes), then serve the API.
    s = Schedule()
    s.run()
    app.run()


if __name__ == '__main__':
    main()
scheduler.py
import time
import multiprocessing
import asyncio

import aiohttp

from .db import Redis_Client
from proxypool.getter import FreeProxyGetter
from proxypool.settings import *


class ValidityTester(object):
    def __init__(self):
        # Proxies waiting to be validated; just fill this list and call test().
        self._raw_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = Redis_Client()

    async def test_single_proxy(self, proxy):
        # Validate one proxy asynchronously.
        try:
            async with aiohttp.ClientSession() as session:
                try:
                    # The proxy comes in as 'ip:port', possibly as bytes from Redis.
                    if isinstance(proxy, bytes):
                        proxy = proxy.decode('utf-8')
                    real_proxy = 'http://' + proxy
                    async with session.get(url=TEST_API,
                                           headers=TEST_API_HEADERS,
                                           timeout=TIME_OUT, proxy=real_proxy) as response:
                        if response.status == 200:
                            # A working proxy goes (back) into the pool.
                            self._conn.put(proxy)
                            print('Valid proxy:', proxy)
                except Exception:
                    print('Invalid proxy:', proxy)
        except Exception:
            pass

    def test(self):
        print('Proxy validator is running!')
        # Nothing to do if there are no proxies to check (asyncio.wait() rejects an empty task list).
        if not self._raw_proxies:
            return
        try:
            # Build the event loop and run all validation tasks concurrently.
            loop = asyncio.get_event_loop()
            tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
            loop.run_until_complete(asyncio.wait(tasks))
        except Exception:
            print('Async Error')


class PoolAdder(object):
    def __init__(self, threshold):
        self._threshold = threshold  # maximum size of the proxy pool
        self._crawler = FreeProxyGetter()
        self._tester = ValidityTester()
        self._conn = Redis_Client()

    def is_over_threshold(self):
        '''
        Stop condition for the adder.
        :return: True if the pool has reached its maximum size
        '''
        if self._conn.queue_len >= self._threshold:
            return True
        return False

    def add_to_queue(self):
        print('Pool adder is running!')
        proxy_count = 0
        while True:
            if self.is_over_threshold():
                break
            # 1. Call every crawl_xxx method that the metaclass registered on the getter.
            for crawl in self._crawler.__Crawl_Func__:
                try:
                    raw_proxies = self._crawler.get_raw_proxies(crawl)
                except Exception:
                    print('Proxy site failed:', crawl)
                    continue
                # 2. Validate the scraped proxies; valid ones are pushed into the pool.
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()
                proxy_count += len(raw_proxies)
            if proxy_count == 0:
                print('All proxy sites failed!')
                raise RuntimeError('All proxy sites failed!')


class Schedule(object):
    @staticmethod
    def valid_proxy(cycle=CYCLE_VALID_TIME):
        '''
        Periodically take proxies from the head of the pool, re-validate them,
        and push the ones that still work back onto the tail of the pool.
        '''
        conn = Redis_Client()
        tester = ValidityTester()
        while True:
            print('Cyclic validator started!')
            # Re-validate half of the pool on each pass.
            count = int(0.5 * conn.queue_len)
            if count == 0:
                print('Not enough proxies in the pool; waiting for the adder to top it up!')
                time.sleep(cycle)
                continue
            # Take the proxies from the head of the list.
            raw_proxies = conn.get(count)
            tester.set_raw_proxies(raw_proxies)
            tester.test()
            time.sleep(cycle)

    @classmethod
    def check_pool(cls, lower_threshold=LOWER_THRESHOLD,
                   upper_threshold=UPPER_THRESHOLD,
                   cycle=CYCLE_ADDER_TIME):
        # Whenever the pool drops below the minimum size, top it up.
        conn = Redis_Client()
        adder = PoolAdder(upper_threshold)
        while True:
            if conn.queue_len < lower_threshold:
                adder.add_to_queue()
            time.sleep(cycle)

    def run(self):
        print('Proxy pool is starting!')
        valid_process = multiprocessing.Process(target=Schedule.valid_proxy)
        check_process = multiprocessing.Process(target=Schedule.check_pool)
        valid_process.start()
        check_process.start()
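If you want to exercise the adder or the validator on their own, without the Flask API or the background processes, here is a minimal sketch (the threshold and the proxy addresses are made up; Redis must be running with the settings below):

from proxypool.scheduler import PoolAdder, ValidityTester

# Crawl and validate until the pool holds 20 proxies.
adder = PoolAdder(threshold=20)
adder.add_to_queue()

# Or validate an explicit list of proxies by hand.
tester = ValidityTester()
tester.set_raw_proxies(['127.0.0.1:8888', '127.0.0.1:9999'])
tester.test()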
settings.py
# Redis password; an empty string means no password.
PASSWORD = ''
PORT = 6379
HOST = 'localhost'
# Name of the Redis list that stores the proxies.
PROXIES = 'proxies'
# URL used to test whether a proxy works
TEST_API = 'https://www.baidu.com/'
TEST_API_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
}
# Timeout (seconds) for the validation request
TIME_OUT = 30
# Interval (seconds) between validation cycles
CYCLE_VALID_TIME = 60
# Interval (seconds) between adder cycles
CYCLE_ADDER_TIME = 60
# Minimum number of proxies in the pool
LOWER_THRESHOLD = 10
# Maximum number of proxies in the pool
UPPER_THRESHOLD = 100