IP proxy pool (acquisition, storage, checking, interface)

IP proxy pool
Core libraries: aiohttp, requests, redis-py, pyquery, Flask
Core modules: storage module, acquisition module (fetches proxy IPs), checking module, interface module, and scheduling module (which ties the previous four together)

StorageModule (storage module)
AcquisitionModule (acquisition module)
CheckingModule (checking module)
InterfaceModule (interface module)
SchedulingModule (scheduling module)

Scheduling flow: acquisition module ------> storage module <------> checking module

Interface module (serves proxies out of the storage module)

Acquisition module:
Periodically crawls proxies from the major proxy websites. Proxies can be paid or free, and each takes the form IP + port. Once crawled successfully, usable proxies are saved to the database.

Storage module:
Stores the crawled proxies. It has to keep proxies deduplicated, mark each proxy's availability, and update every proxy dynamically in real time, so a Redis sorted set (SortedSet) is used.

Checking module:
Periodically checks the proxies in the database. Set a test URL; ideally test against the very site you intend to crawl, which makes the check more targeted.
For a general-purpose pool, a URL such as Baidu can be used instead. Each proxy also needs a status marker, such as a score: 100 means usable, and the lower the score, the less usable the proxy.
When a check succeeds, the score is immediately reset to 100 (or incremented by 1 from its current value); when it fails, the score is decremented by 1. Once the score falls to a certain value, the proxy is removed from the database outright.
This score makes each proxy's availability easy to judge. (The implementation below uses MAX_SCORE = 20 rather than 100, but the mechanism is the same; see the sketch that follows.)
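
To make the scoring scheme concrete, here is a minimal sketch of the same idea expressed directly as redis-py sorted-set calls. The key name, score values and proxy string are placeholders that mirror the full RedisClient further down; this is not the pool's actual code path.

import redis

# Minimal sketch of the availability score kept in a Redis sorted set.
# Values mirror the RedisClient below: initial 10, maximum 20, minimum 0.
r = redis.StrictRedis(host='localhost', port=6379, decode_responses=True)
KEY = 'kproxies'
proxy = '1.2.3.4:8080'            # placeholder proxy

r.zadd(KEY, {proxy: 10})          # new proxy enters the pool with the initial score
r.zadd(KEY, {proxy: 20})          # a successful check resets the score to the maximum
r.zincrby(KEY, -1, proxy)         # a failed check knocks one point off
if (r.zscore(KEY, proxy) or 0) <= 0:
    r.zrem(KEY, proxy)            # score exhausted: evict the proxy from the pool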

Interface module:
Exposes the pool to the outside through an API. Callers could read the data straight from the database, but then they would need the connection details and a compatible client.
A safer approach is to provide a Web API, so a usable proxy can be obtained simply by hitting an endpoint. Since a random usable proxy is returned each time, every usable proxy gets handed out, which also spreads the load across them.
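
Judging from the import paths used in the code that follows, the project is organised roughly as below. The file names are inferred from those imports; the name of the entry script is not given in the original.

AcquisitionModule/
    utils.py            # get_page
    crawler.py          # ProxyMetaclass, Crawler
    getter.py           # Getter
StorageModule/
    db.py               # RedisClient
    error.py            # PoolEmptyError
CheckingModule/
    tester.py           # Tester
InterfaceModule/
    api.py              # Flask app
SchedulingModule/
    scheduler.py        # Scheduler
proxypoolsetting.py     # ProxyPoolSetting
(entry script)          # calls Scheduler().run()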


AcquisitionModule (acquisition module)

# AcquisitionModule/utils.py: fetch page content
import requests
from requests.exceptions import ConnectionError

base_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/54.0.2840.71 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
}

def get_page(url, options=None):
    """
    Fetch a page for proxy crawling.
    :param url: page URL
    :param options: extra headers merged into the defaults
    :return: page HTML on success, otherwise None
    """
    headers = dict(base_headers, **(options or {}))
    print('Fetching page:', url)

    try:
        response = requests.get(url, headers=headers)
        print('Visited', url, 'status code:', response.status_code)
        if response.status_code == 200:
            print('Page fetched successfully:', url)
            return response.text
    except ConnectionError as e:
        print('Failed to fetch page:', url, 'reason:', e)
        return None

# Test code
# if __name__ == '__main__':
#     get_page('https://www.baidu.com/')
# AcquisitionModule/crawler.py: crawl proxy IPs
import re
import json
import time
from AcquisitionModule.utils import get_page
from pyquery import PyQuery as pq


class ProxyMetaclass(type):
    """
    Collects every method whose name contains 'crawl_' into __CrawlFunc__ and
    records the count in __CrawlFuncCount__, so all crawlers can be invoked dynamically.
    """

    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)

class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        """
        Call one crawl_* method by name and collect the proxies it yields.
        """
        proxies = []
        # eval builds e.g. self.crawl_kuaidaili()
        for proxy in eval("self.{}()".format(callback)):
            print('Got proxy:', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_kuaidaili(self, page_count=2):
        """
        Crawl proxies from kuaidaili (https://www.kuaidaili.com)
        :param page_count: number of pages to crawl
        :return: yields ip:port strings
        """
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            html = get_page(url)
            time.sleep(1)  # be polite between page requests
            if html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(html)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
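
    # Hypothetical example (not in the original): a second proxy source, showing
    # how ProxyMetaclass auto-registers any 'crawl_*' method added to this class.
    def crawl_example_site(self):
        """
        Hypothetical extra source -- included only to illustrate that any method
        whose name contains 'crawl_' is picked up by the Getter automatically.
        The URL and table layout are placeholders, not a real proxy site.
        :return: yields ip:port strings
        """
        html = get_page('http://proxy.example.com/free')   # placeholder URL
        if html:
            doc = pq(html)
            for row in doc('table tr').items():
                ip = row('td').eq(0).text()      # first cell: IP
                port = row('td').eq(1).text()    # second cell: port
                if ip and port:
                    yield ip + ':' + port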

# Test code
# if __name__ == '__main__':
#     crawler = Crawler()
#     # proxies = crawler.get_proxies()
#     # print(proxies)
#     address_ports = crawler.crawl_kuaidaili()
#     for address_port in address_ports:
#         print(address_port)
# AcquisitionModule/getter.py: dynamically invokes every crawl_* method, crawls the proxies, and stores them in the database:

from AcquisitionModule.crawler import Crawler
from StorageModule.db import RedisClient
from proxypoolsetting import ProxyPoolSetting
import sys

# POOL_UPPER_THRESHOLD = 10000
setting = ProxyPoolSetting()
POOL_UPPER_THRESHOLD = setting.POOL_UPPER_THRESHOLD

class Getter():
    def __init__(self):
        self.crawler = Crawler()
        self.redis = RedisClient()

    def is_over_threshold(self):
        """
        Check whether the proxy pool has reached its size limit.
        :return: True if the pool is full
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True
        else:
            return False

    def run(self):
        """
        Run the getter: invoke every registered crawler and store the results.
        :return:
        """
        print('Getter starts running')
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)

# Test code
# if __name__ == "__main__":
#     getter = Getter()
#     getter.run()

StorageModule (storage module)

# StorageModule/db.py: storage module

import redis
from random import choice
from StorageModule.error import PoolEmptyError
import re

REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_PASSWORD = None
REDIS_KEY = 'kproxies'

INITIAL_SCORE = 10
MAX_SCORE = 20
MIN_SCORE = 0

class RedisClient(object):
    def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD):
        """
        Initialize the Redis connection.
        :param host: host
        :param port: port
        :param password: password
        """
        self.db = redis.StrictRedis(host=host, port=port, password=password, decode_responses=True)

    def add(self, proxy, score=INITIAL_SCORE):
        """
        Add a proxy with an initial score.
        :param proxy: proxy
        :param score: score
        :return: result of the add
        """
        if not re.match(r'\d+\.\d+\.\d+\.\d+:\d+', proxy):
            print('Proxy is malformed:', proxy, 'discarded')
            return
        if not self.db.zscore(REDIS_KEY, proxy):
            # return self.db.zadd(REDIS_KEY, score, proxy)  # old redis-py 2.x style
            return self.db.zadd(REDIS_KEY, {proxy: score})

    def random(self):
        """
        Get a random usable proxy: try the top-score proxies first;
        if none exist, fall back to the highest-ranked ones; otherwise raise.
        :return: a random proxy
        """
        result = self.db.zrangebyscore(REDIS_KEY, MAX_SCORE, MAX_SCORE)
        if len(result):
            return choice(result)
        else:
            result = self.db.zrevrange(REDIS_KEY, 0, 100)
            if len(result):
                return choice(result)
            else:
                raise PoolEmptyError

    def decrease(self, proxy):
        """
        Decrement a proxy's score by 1; if the score falls below the minimum, remove the proxy.
        :param proxy: proxy
        :return: the new score, or the removal result
        """
        score = self.db.zscore(REDIS_KEY, proxy)
        if score and score > MIN_SCORE:
            print('Proxy', proxy, 'current score', score, 'decrease by 1')
            # return self.db.zincrby(REDIS_KEY, proxy, -1)  # old redis-py 2.x style
            return self.db.zincrby(REDIS_KEY, -1, proxy)
        else:
            print('Proxy', proxy, 'current score', score, 'removed')
            return self.db.zrem(REDIS_KEY, proxy)

    def exists(self, proxy):
        """
        Check whether a proxy is already in the pool.
        :param proxy: proxy
        :return: whether it exists
        """
        return self.db.zscore(REDIS_KEY, proxy) is not None

    def max(self, proxy):
        """
        Set a proxy's score to the maximum, MAX_SCORE.
        :param proxy: proxy
        :return: result of the update
        """
        print('Proxy', proxy, 'is usable, setting score to', MAX_SCORE)
        # return self.db.zadd(REDIS_KEY, MAX_SCORE, proxy)  # old redis-py 2.x style, dropped

        # when a proxy passes the check, reset its score to MAX_SCORE (20)
        return self.db.zadd(REDIS_KEY, {proxy: MAX_SCORE})

        # alternative: when a proxy passes the check, add 1 instead
        # score = self.db.zscore(REDIS_KEY, proxy)
        # print('Proxy usable:', proxy, 'current score', score, 'increase by 1')
        # return self.db.zincrby(REDIS_KEY, +1, proxy)

    def count(self):
        """
        Get the number of proxies in the pool.
        :return: count
        """
        return self.db.zcard(REDIS_KEY)

    def all(self):
        """
        Get all proxies.
        :return: list of all proxies
        """
        return self.db.zrangebyscore(REDIS_KEY, MIN_SCORE, MAX_SCORE)

# Test code
# if __name__ == '__main__':
#     redis_client = RedisClient()
#     proxy = '222.189.190.151:9999'
#     # redis_client.add(proxy)
#     redis_client.decrease(proxy)

# StorageModule/error.py
class PoolEmptyError(Exception):

    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy pool is exhausted')

CheckingModule (checking module)

# CheckingModule/tester.py: proxy checker
from StorageModule.db import RedisClient
import aiohttp
import asyncio
import time
import sys
from proxypoolsetting import ProxyPoolSetting
try:
    from aiohttp import ClientError
except ImportError:
    # fall back for aiohttp versions without ClientError, keeping the same name
    from aiohttp import ClientProxyConnectionError as ClientError

setting = ProxyPoolSetting()
# valid status codes
# VALID_STATUS_CODES = [200]
VALID_STATUS_CODES = setting.VALID_STATUS_CODES
# test URL
# TEST_URL = 'http://www.baidu.com'
TEST_URL = setting.TEST_URL
# maximum batch size for testing
# BATCH_TEST_SIZE = 100
BATCH_TEST_SIZE = setting.BATCH_TEST_SIZE

class Tester(object):
    def __init__(self):
        self.redis = RedisClient()

    async def test_single_proxy(self, proxy):
        """
        Test a single proxy.
        :param proxy: the proxy to test
        :return: None
        """
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy, bytes):
                    proxy = proxy.decode('utf-8')
                real_proxy = 'http://' + proxy
                print('Testing:', proxy)
                async with session.get(TEST_URL, proxy=real_proxy, timeout=15) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print('Proxy is usable:', proxy)
                    else:
                        self.redis.decrease(proxy)
                        print('Invalid response status code:', proxy)
            except (ClientError, aiohttp.client_exceptions.ClientConnectorError, asyncio.TimeoutError, AttributeError):
                self.redis.decrease(proxy)
                print('Proxy request failed:', proxy)

    def run(self):
        """
        Main test loop.
        :return: None
        """
        print('Tester starts running')
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            # test in batches
            for i in range(0, len(proxies), BATCH_TEST_SIZE):
                test_proxies = proxies[i:i + BATCH_TEST_SIZE]
                tasks = [self.test_single_proxy(proxy) for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                time.sleep(5)
        except Exception as e:
            print('Tester error:', e.args)

# Test code
# if __name__ == '__main__':
#     tester = Tester()
#     print('Testing starts')
#     while True:
#         t = tester.run()

InterfaceModule (interface module)

# InterfaceModule/api.py: Web API

from flask import Flask,g
from StorageModule.db import RedisClient

__all__ = ['app']
app = Flask(__name__)

def get_conn():
    """
    Get (or create) the Redis client bound to the request context.
    :return: g.redis
    """
    if not hasattr(g, 'redis'):
        g.redis = RedisClient()
    return g.redis

@app.route('/')
def index():
    """
    Index page.
    :return: index page HTML
    """
    return '<h2>Welcome to Proxy Pool System</h2>' \
           '<a href="http://127.0.0.1:5000/random">proxy</a>\n' \
           '<a href="http://127.0.0.1:5000/count">count</a>'

@app.route('/random')
def get_proxy():
    """
    Get a random usable proxy.
    :return: a random proxy
    """
    conn = get_conn()
    return conn.random() + '\n<a href="http://127.0.0.1:5000">back</a>'

@app.route('/count')
def get_count():
    """
    Get the total number of proxies in the pool.
    :return: pool size
    """
    conn = get_conn()
    return str(conn.count()) + '\n<a href="http://127.0.0.1:5000">back</a>'

# Test code
# if __name__ == '__main__':
#     app.run()
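
Once the API is up (by default on http://127.0.0.1:5000), a crawler can fetch a proxy from /random and route its requests through it. Below is a minimal client sketch; the target URL is a placeholder and the helper names are illustrative, not part of the original project.

# Sketch of a crawler-side client for the Web API above.
# Assumes the Flask app is running locally on 127.0.0.1:5000.
import requests

PROXY_POOL_URL = 'http://127.0.0.1:5000/random'

def get_random_proxy():
    """Fetch one usable proxy (ip:port) from the pool API."""
    response = requests.get(PROXY_POOL_URL)
    if response.status_code == 200:
        # /random appends an HTML "back" link, so keep only the first line
        return response.text.splitlines()[0].strip()
    return None

def fetch_with_proxy(url):
    """Request the target URL through a proxy taken from the pool."""
    proxy = get_random_proxy()
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    return requests.get(url, proxies=proxies, timeout=10)

if __name__ == '__main__':
    print(fetch_with_proxy('https://www.example.com').status_code)  # placeholder target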

SchedulingModule (scheduling module)

# SchedulingModule/scheduler.py
from CheckingModule.tester import Tester
from AcquisitionModule.getter import Getter
from InterfaceModule.api import app
import time
from multiprocessing import Process
from proxypoolsetting import ProxyPoolSetting

setting = ProxyPoolSetting()
# cycle intervals (seconds)
# TESTER_CYCLE = 20
TESTER_CYCLE = setting.TESTER_CYCLE
# GETTER_CYCLE = 30
GETTER_CYCLE = setting.GETTER_CYCLE
# module switches
# TESTER_ENABLED = True
TESTER_ENABLED = setting.TESTER_ENABLED
# GETTER_ENABLED = True
GETTER_ENABLED = setting.GETTER_ENABLED
# API_ENABLED = True
API_ENABLED = setting.API_ENABLED

class Scheduler():
    def schedule_tester(self, cycle=TESTER_CYCLE):
        """
        Test proxies on a fixed cycle.
        :return:
        """
        tester = Tester()
        while True:
            print('Tester starts running')
            tester.run()
            time.sleep(cycle)

    def schedule_getter(self, cycle=GETTER_CYCLE):
        """
        Crawl new proxies on a fixed cycle.
        :return:
        """
        getter = Getter()
        while True:
            print('Start crawling proxies')
            getter.run()
            time.sleep(cycle)

    def schedule_api(self):
        """
        Start the Web API.
        :return:
        """
        app.run()

    def run(self):
        print('Proxy pool starts running')

        if GETTER_ENABLED:
            getter_process = Process(target=self.schedule_getter)
            getter_process.start()

        if TESTER_ENABLED:
            tester_process = Process(target=self.schedule_tester)
            tester_process.start()

        if API_ENABLED:
            api_process = Process(target=self.schedule_api)
            api_process.start()

# Test code
if __name__ == '__main__':
    scheduler = Scheduler()
    scheduler.run()

Running and settings

# proxypoolsetting.py: basic settings

class ProxyPoolSetting():
    def __init__(self):
        # maximum number of proxies in the pool
        self.POOL_UPPER_THRESHOLD = 10000
        # valid status codes
        self.VALID_STATUS_CODES = [200]
        # test URL
        self.TEST_URL = 'http://www.baidu.com'
        # maximum batch size for testing
        self.BATCH_TEST_SIZE = 100
        # cycle intervals (seconds)
        self.TESTER_CYCLE = 20
        self.GETTER_CYCLE = 30
        # module switches
        self.TESTER_ENABLED = True
        self.GETTER_ENABLED = True
        self.API_ENABLED = True
# Run (entry script)
from SchedulingModule.scheduler import Scheduler

def run():
    s = Scheduler()
    s.run()

if __name__ == '__main__':
    run()