自建代理池

MAX_SCORE = 100 
MIN_SCORE = 0 
INITIAL_SCORE = 10 
REDIS_HOST = "127.0.0.1"
REDIS_PORT = 6379
REDIS_PASSWORD = None 
REDIS_KEY = "proxies"
import redis 
from random import choice 
import time
import asyncio
import aiohttp
import csv 
class RedisClient(object):
    def __init__(self,host=REDIS_HOST,port=REDIS_PORT,password=REDIS_PASSWORD):
        """
            初始化
        """
        self.db = redis.StrictRedis(host=host,port=port,password=password,decode_responses=True)
        
    def add(self,proxy,score=INITIAL_SCORE):
        """
        添加代理,设置分数为最高
        :param proxy:代理
        :param score:分数
        :return:添加结果
        """
        if not self.db.zscore(REDIS_KEY,proxy):
            return self.db.zadd(REDIS_KEY,score,proxy)
    def random(self):
        """
        随机获取有效代理,首先尝试获取最高分数代理,如果最高分数不存在,则按照排名获取,否则异常
        :return :随机代理
        """
        result = self.db.zrangebyscore(REDIS_KEY,MAX_SCORE,MAX_SCORE)
        if len(result):
            return choice(result)
        else:
            result = self.db.zrevrange(REDIS_KEY,0,100)
            if len(result):
                return choice(result)
            else:
                raise PoolEmptyError 
    def decrease(self,proxy):
        """
        代理值减一分,分数小于最小值,则代理删除
        :param proxy:代理
        :return:修改后的代理分数
        """
        score = self.db.zscore(REDIS_KEY,proxy)
        if score and score > MIN_SCORE:
            print("代理",proxy,"当前分数",score,"减一")
            return self.db.zincrby(REDIS_KEY,proxy,-1)
        else:
            print("代理",proxy,"当前分数",score,"移除")
            return self.db.zrem(REDIS_KEY,proxy)
    def exists(self,proxy):
        """
        判断是否存在
        :param proxy:代理
        :return: 是否存在
        """
        return not self.db.zscore(REDIS_KEY,proxy) == None
    def max(self,proxy):
        """
        将代理设置为MAX_SCORE
        :param proxy:代理
        :return:设置结果
        """
        print("代理",proxy,"可用",'设置为',MAX_SCORE)
        return self.db.zadd(REDIS_KEY,MAX_SCORE,proxy)
    def count(self):
        """
        获取数量
        :return :数量
        """
        return self.db.zcard(REDIS_KEY)
    def all(self):
        """
        获取全部代理
        :return :全部代理列表
        
        """
        return self.db.zrangebyscore(REDIS_KEY,MIN_SCORE,MAX_SCORE)

import json
from pyquery import PyQuery as pq 
class ProxyMetaclass(type):
    def __new__(cls,name,bases,attrs):
        count = 0 
        attrs["__CrawlFunc__"] = []
        for k,v in attrs.items():
            if 'crawl_' in k:
                attrs["__CrawlFunc__"].append(k)
                count += 1 
        attrs["__CrawlFuncCount__"] = count 
        return type.__new__(cls,name,bases,attrs)
import requests
class Crawler(object,metaclass=ProxyMetaclass):
    def get_proxies(self,callback):
        proxies = [] 
        for proxy in eval("self.{}()".format(callback)):
            print("成功获取到代理",proxy)
            proxies.append(proxy)
        return proxies 
    # def crawl_daili66(self,page_count=4):
    #     start_url = "http://www.66ip.cn/{}.html"
    #     urls = [ start_url.format(page) for page in range(1,page_count+1)]
    #     for url in urls:
    #         print("Crawling",url)
    #         html = requests.get(url).text
    #         if html:
    #             doc = pq(html)
    #             trs = doc(".containerbox table tr:gt(0)").items()
    #             for tr in trs:
    #                 ip = tr.find("td:nth-child(1)").text()
    #                 port = tr.find("td:nth-child(2)").text()
    #                 yield ":".join([ip,port])
    def crawl_xiaohuan(self):
        with open("data.csv") as f:
            reader = csv.reader(f)
            for row in reader:
                if row:
                    yield row[0]

POOL_UPPER_THRESHOLD = 10000
class Getter():
    def __init__(self):
        self.redis = RedisClient()
        self.crawler = Crawler()
    def is_over_threshold(self):
        """
        判断是否达到了代理池限制
        """
        if self.redis.count() >= POOL_UPPER_THRESHOLD:
            return True 
        else:
            return False 
    def run(self):
        print("获取器开始执行")
        if not self.is_over_threshold():
            for callback_label in range(self.crawler.__CrawlFuncCount__):
                callback = self.crawler.__CrawlFunc__[callback_label]
                proxies = self.crawler.get_proxies(callback)
                for proxy in proxies:
                    self.redis.add(proxy)
VALID_STATUS_CODES = [200]
TEST_URL = "http://www.quanshuwang.com"
BATCH_TEST_SIZE = 100 
class Tester(object):
    def __init__(self):
        self.redis = RedisClient() 
    async def test_single_proxy(self,proxy):
        """
        测试单个代理
        :param proxy:单个代理
        :return:None 
        
        """
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
        }
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                if isinstance(proxy,bytes):
                    proxy = proxy.decode("utf-8")
                real_proxy = "http://"+ proxy 
                print("正在测试",proxy)
                async with session.get(TEST_URL,proxy=real_proxy,headers=headers,timeout=5) as response:
                    if response.status in VALID_STATUS_CODES:
                        self.redis.max(proxy)
                        print("代理可用",proxy)
                    else:
                        self.redis.decrease(proxy)
                        print("请求响应码不合法",proxy)
            except Exception as e:
                print(e.args)
                self.redis.decrease(proxy)
                print("代理请求失败",proxy)
    def run(self):
        print("测试器开始运行")
        try:
            proxies = self.redis.all()
            loop = asyncio.get_event_loop()
            #批量测试
            for i in range(0,len(proxies),BATCH_TEST_SIZE):
                test_proxies = proxies[i:i+BATCH_TEST_SIZE]
                tasks = [self.test_single_proxy(proxy)for proxy in test_proxies]
                loop.run_until_complete(asyncio.wait(tasks))
                #time.sleep(5)
        except Exception as e:
            print("测试器发生错误",e.args)
from flask import Flask,g 
__all__=['app']
app = Flask(__name__)
def get_conn():
    if not hasattr(g,'redis'):
        g.redis = RedisClient() 
    return g.redis 
@app.route("/")
def index():
    return "<h2>Welcome to Proxy Pool Systen</h2>"
@app.route("/random")
def get_proxy():
    """
    获取随机可用代理
    :return 随机代理
    """
    conn = get_conn()
    return conn.random() 
@app.route("/count")
def get_counts():
    """
    获取代理池总量
    :return 代理池总量
    """
    conn = get_conn()
    return str(conn.count())

TESTER_CYCLE = 2 
GETTER_CYCLE = 20
TESTER_ENABLED = False
GETTER_ENABLED = False
API_ENABLED = True 
API_HOST = "127.0.0.1" 
API_PORT = 5555
from multiprocessing import Process 
class Scheduler():
    def schedule_tester(self,cycle=TESTER_CYCLE):
        """
        定时测试代理
        
        """
        tester = Tester()
        while  True:
            print("测试器开始运行")
            tester.run()
            time.sleep(cycle)
    def schedule_getter(self,cycle=GETTER_CYCLE):
        """
        定时获取代理
        """
        getter = Getter()
        while True:
            print("开始抓取代理")
            getter.run()
            time.sleep(cycle)
    def schedule_api(self):
        """
        开启api
        """
        app.run(API_HOST,API_PORT)
    def run(self):
        print("代理池开始运行")
        if TESTER_ENABLED:
            tester_process = Process(target=self.schedule_tester)
            tester_process.start() 
        if GETTER_ENABLED:
            getter_process = Process(target=self.schedule_getter)
            getter_process.start()
        if API_ENABLED:
            api_process = Process(target=self.schedule_api)
            api_process.start()
Scheduler().run()      

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值