api.py
from flask import Flask, g
from proxypool.db import RedisClient

__all__ = ['app']

app = Flask(__name__)

def get_conn():
    # Cache one RedisClient on Flask's application-context globals.
    # Note: the attribute checked and the attribute set must match.
    if not hasattr(g, 'redis_client'):
        g.redis_client = RedisClient()
    return g.redis_client
@app.route('/')
def index():
    return '<h1>Welcome to the proxy pool system!</h1>'
@app.route('/get')
def get():
    conn = get_conn()
    proxy = conn.pop()
    if proxy is None:
        return 'The proxy pool is empty!'
    if isinstance(proxy, bytes):
        return proxy.decode('utf-8')
    return proxy
@app.route('/count')
def count():
return str(get_conn().queue_len)
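# A client-side sketch of the endpoints above, assuming run.py is serving
# the app on Flask's default http://127.0.0.1:5000:
#
#   import requests
#   print(requests.get('http://127.0.0.1:5000/count').text)  # pool size
#   print(requests.get('http://127.0.0.1:5000/get').text)    # pop one proxy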
db.py
import redis
from proxypool.settings import PASSWORD, HOST, PORT, PROXIES

class RedisClient(object):
    def __init__(self, host=HOST, port=PORT):
        if PASSWORD:
            self._db = redis.Redis(host=host, port=port, password=PASSWORD)
        else:
            self._db = redis.Redis(host=host, port=port)

    def put(self, proxy):
        # Append one proxy to the tail of the Redis list.
        self._db.rpush(PROXIES, proxy)

    def get(self, count=1):
        # Take up to `count` proxies from the head of the list, then trim
        # them off so they are not handed out twice.
        proxies = self._db.lrange(PROXIES, 0, count - 1)
        self._db.ltrim(PROXIES, count, -1)
        return proxies

    def pop(self):
        # rpop returns None when the list is empty; guard before decoding.
        proxy = self._db.rpop(PROXIES)
        if proxy is None:
            return None
        return proxy.decode('utf-8')
@property
def queue_len(self):
return self._db.llen(PROXIES)
    def flush(self):
        # Warning: flushall() wipes the entire Redis instance, not just the
        # proxy list.
        self._db.flushall()
if __name__ == '__main__':
    r = RedisClient()
print(r.queue_len)
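    # A minimal round-trip sketch (assumes a local Redis on the default
    # port; the address below is a made-up example):
    r.put('127.0.0.1:8080')
    print(r.get(1))  # -> [b'127.0.0.1:8080'], raw bytes from Redis
    print(r.pop())   # -> None, the list is empty again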
demo_eval.py
str1 = 'a()'

def a():
    print('a11111')

# eval() executes the string as Python code, which calls a() above.
eval(str1)
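# The same dispatch without eval(): look the callable up by name and call
# it directly, so no arbitrary string is ever executed.
func = globals()['a']
func()  # prints a11111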
run.py
from proxypool.api import app
from proxypool.Scheduler import Scheduler
def main():
    s = Scheduler()
    # Start the background processes (validator + pool adder), then serve
    # the Flask API from the main process.
    s.run()
    app.run()
if __name__ == '__main__':
main()
settings.py
# Redis connection settings.
PASSWORD = ''
HOST = 'localhost'
PORT = 6379
# Key of the Redis list that holds the proxies.
PROXIES = 'proxies_new'
# URL and headers used to check whether a proxy is alive.
TEST_API = 'https://www.baidu.com/'
TEST_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
}
TEST_TIMEOUT = 30
# Seconds between validation cycles.
CYCLE_VALID_TIME = 100
# Pool bounds: refill below LOWER_THRESHOLD, stop adding at UPPER_THRESHOLD.
LOWER_THRESHOLD = 10
UPPER_THRESHOLD = 100
# Seconds between pool-size checks.
CYCLE_CHECK_TIME = 100
demo_with.py
import requests
with requests.get('https://www.baidu.com/') as response:
print(response.text)
print(response.status_code)
getter.py
import requests
from lxml import etree
class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        # Collect the names of all crawl_* methods so callers can iterate
        # over every proxy source without hard-coding the list.
        attrs['__CrawlFunc__'] = []
        count = 0
        for k, v in attrs.items():
            if k.startswith('crawl_'):
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlCount__'] = count
        return type.__new__(cls, name, bases, attrs)
class FreeProxyGetter(object, metaclass=ProxyMetaclass):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
}
    def get_raw_proxies(self, callback):
        '''
        Call a crawler by its method name and collect the proxies it yields.
        :param callback: method name as a string, e.g. 'crawl_66ip'
        :return: list of proxies
        '''
        proxies = []
        # getattr dispatches by name without resorting to eval().
        for proxy in getattr(self, callback)():
            proxies.append(proxy)
        return proxies
    def crawl_66ip(self):
        '''
        url: http://www.66ip.cn/
        :return: yields 'ip:port' strings
        '''
        base_url = 'http://www.66ip.cn/%s.html'
        for page in range(1, 20):
            response = requests.get(base_url % page, headers=self.headers)
            html = etree.HTML(response.text)
            ips = html.xpath('//tr[position()>1]/td[1]/text()')
            ports = html.xpath('//tr[position()>1]/td[2]/text()')
            if ips and len(ips) == len(ports):
                for ip, port in zip(ips, ports):
                    yield ip.strip() + ':' + port.strip()
    def crawl_ip3366(self):
        '''
        url: http://www.ip3366.net/?stype=1&page=1
        :return: yields 'ip:port' strings
        '''
        base_url = 'http://www.ip3366.net/?stype=1&page=%s'
        for page in range(1, 11):
            response = requests.get(base_url % page, headers=self.headers)
            html = etree.HTML(response.text)
            ips = html.xpath('//tr/td[1]/text()')
            ports = html.xpath('//tr/td[2]/text()')
            if ips and len(ips) == len(ports):
                for ip, port in zip(ips, ports):
                    yield ip.strip() + ':' + port.strip()
if __name__ == '__main__':
f = FreeProxyGetter()
    print(f.__CrawlFunc__)
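    # A minimal sketch of the registration the metaclass performs: a class
    # built with ProxyMetaclass lists the crawl_* methods defined in its
    # own body (the source name and address here are made up):
    class ExampleGetter(object, metaclass=ProxyMetaclass):
        def crawl_example(self):
            yield '127.0.0.1:8080'
    print(ExampleGetter.__CrawlFunc__)   # ['crawl_example']
    print(ExampleGetter.__CrawlCount__)  # 1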
Scheduler.py
import time
import asyncio
from multiprocessing import Process
import aiohttp
from proxypool.settings import *
from proxypool.db import RedisClient
from proxypool.getter import FreeProxyGetter
class ValidityTester(object):
    def __init__(self):
        self._raw_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        # (Re)create the Redis connection alongside each new batch, so each
        # worker process ends up with its own client.
        self._conn = RedisClient()
    async def test_single_proxy(self, proxy):
        try:
            async with aiohttp.ClientSession() as session:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                try:
                    async with session.get(TEST_API,
                                           headers=TEST_REQUEST_HEADERS,
                                           proxy=real_proxy,
                                           timeout=TEST_TIMEOUT) as response:
                        if response.status == 200:
                            self._conn.put(proxy)
                            print('Valid proxy!', proxy)
                except Exception:
                    print('Invalid proxy!', proxy)
        except Exception as e:
            print(e)
    def test(self):
        print('Proxy tester is starting!')
        if not self._raw_proxies:
            return
        loop = asyncio.get_event_loop()
        tasks = [self.test_single_proxy(proxy) for proxy in self._raw_proxies]
        # gather accepts coroutines directly; asyncio.wait raises on an
        # empty list and rejects bare coroutines on newer Python versions.
        loop.run_until_complete(asyncio.gather(*tasks))
class PoolAdder(object):
    def __init__(self, threshold):
        self._threshold = threshold
        self._tester = ValidityTester()
        self._conn = RedisClient()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        '''
        :return: True if the pool has reached the threshold, else False
        '''
        return self._conn.queue_len >= self._threshold

    def add_to_queue(self):
        print('Pool adder is starting....')
        while True:
            if self.is_over_threshold():
                break
            proxy_count = 0
            for crawl in self._crawler.__CrawlFunc__:
                try:
                    raw_proxies = self._crawler.get_raw_proxies(crawl)
                except Exception:
                    continue
                self._tester.set_raw_proxies(raw_proxies)
                self._tester.test()
                proxy_count += len(raw_proxies)
            if proxy_count == 0:
                print('All proxy sources failed, please add new ones!')
                raise RuntimeError('All proxy sources failed!')
class Scheduler(object):
    @staticmethod
    def valid_proxy(cycle=CYCLE_VALID_TIME):
        # Periodically take half of the pool and re-test it.
        conn = RedisClient()
        tester = ValidityTester()
        while True:
            print('Validation loop is starting!')
            count = int(conn.queue_len * 0.5)
            if count == 0:
                print('Not enough proxies in the pool! Waiting for refill...')
                time.sleep(cycle)
                # Without this continue, get(0) would return the whole list
                # without trimming it, re-queuing duplicates after testing.
                continue
            raw_proxies = conn.get(count)
            tester.set_raw_proxies(raw_proxies)
            tester.test()
            time.sleep(cycle)
    @staticmethod
    def check_pool_add(lower_threshold=LOWER_THRESHOLD,
                       upper_threshold=UPPER_THRESHOLD,
                       cycle=CYCLE_CHECK_TIME):
        # Refill the pool whenever it drops below the lower threshold.
        conn = RedisClient()
        adder = PoolAdder(upper_threshold)
        while True:
            if conn.queue_len < lower_threshold:
                adder.add_to_queue()
            time.sleep(cycle)

    def run(self):
        valid_process = Process(target=Scheduler.valid_proxy)
        check_process = Process(target=Scheduler.check_pool_add)
        valid_process.start()
        check_process.start()
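# A consumer-side sketch: fetch a proxy from the API and route a request
# through it (assumes run.py is serving on Flask's default
# http://127.0.0.1:5000; the target URL is just an example):
#
#   import requests
#   proxy = requests.get('http://127.0.0.1:5000/get').text
#   response = requests.get('https://www.baidu.com/',
#                           proxies={'http': 'http://' + proxy,
#                                    'https': 'http://' + proxy})
#   print(response.status_code)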