How to Test the Quality and Stability of Proxy IPs

Introduction

You bought proxy IPs but have no idea how good they actually are? They work for a while and then suddenly die? Anyone who writes crawlers has run into these problems.

Proxy quality directly determines a crawler's success rate, and stability determines whether the project can keep running at all. This article walks through a complete proxy testing toolkit to help you filter out the proxies that are actually worth using.

Proxy IP Quality Metrics

Core metrics

1. Availability

  • Can a connection be established at all
  • Is the HTTP status code correct
  • Is the response body complete

2. Speed

  • Connection setup time
  • Response time
  • Download throughput

3. Anonymity

  • Does the proxy leak your real IP
  • Does it carry telltale proxy headers
  • Anonymity level (transparent / anonymous / elite)

4. Stability

  • Length of continuous uptime
  • Trend of the failure rate
  • Frequency of outages

5. Location

  • Where the IP is registered
  • Distance to the target server
  • Timezone match
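
Before writing any checker, it helps to fix a record shape for what you measure. The dataclass below is one minimal way to capture the five metric groups above; the field names are illustrative assumptions, not any standard.

from dataclasses import dataclass
from typing import Optional

@dataclass
class ProxyQualityReport:
    """One record per proxy, covering the five metric groups above."""
    proxy_url: str
    available: bool = False                 # availability: did the last check succeed?
    response_time: Optional[float] = None   # speed: seconds for a test request
    anonymity: str = 'unknown'              # anonymity: transparent / anonymous / elite
    success_rate: float = 0.0               # stability: fraction of recent checks that passed
    country: Optional[str] = None           # location: where the exit IP is registered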

Basic Testing Tools

Simple availability check

import requests
import time
from datetime import datetime

class BasicProxyChecker:
    def __init__(self):
        self.test_urls = [
            'http://httpbin.org/ip',
            'https://api.ipify.org?format=json',
            'http://icanhazip.com'
        ]
    
    def check_proxy(self, proxy_url, timeout=10):
        """基础代理检测"""
        result = {
            'proxy': proxy_url,
            'timestamp': datetime.now().isoformat(),
            'working': False,
            'response_time': None,
            'ip_address': None,
            'error': None
        }
        
        try:
            start_time = time.time()
            
            response = requests.get(
                self.test_urls[0],
                proxies={
                    'http': proxy_url,
                    'https': proxy_url
                },
                timeout=timeout,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                }
            )
            
            end_time = time.time()
            
            if response.status_code == 200:
                result['working'] = True
                result['response_time'] = end_time - start_time
                
                # Extract the exit IP address
                try:
                    ip_data = response.json()
                    result['ip_address'] = ip_data.get('origin', '').split(',')[0].strip()
                except ValueError:
                    result['ip_address'] = response.text.strip()
                    
        except Exception as e:
            result['error'] = str(e)
        
        return result
    
    def quick_test(self, proxy_list):
        """快速批量测试"""
        results = []
        
        print(f"开始测试 {len(proxy_list)} 个代理...")
        
        for i, proxy in enumerate(proxy_list, 1):
            print(f"[{i}/{len(proxy_list)}] 测试: {proxy}")
            
            result = self.check_proxy(proxy)
            results.append(result)
            
            if result['working']:
                print(f"✓ 可用 - IP: {result['ip_address']} - 响应时间: {result['response_time']:.2f}s")
            else:
                print(f"✗ 不可用 - {result['error']}")
        
        # 统计结果
        working_count = sum(1 for r in results if r['working'])
        print(f"\n测试完成: {working_count}/{len(proxy_list)} 个代理可用")
        
        return results

# Usage example
checker = BasicProxyChecker()

test_proxies = [
    'http://username:password@proxy1.com:8080',
    'http://username:password@proxy2.com:8080',
    'http://username:password@proxy3.com:8080',
]

results = checker.quick_test(test_proxies)

In-Depth Quality Analysis

import requests
import time
import json
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

class AdvancedProxyAnalyzer:
    def __init__(self):
        self.test_endpoints = {
            'ip_check': 'http://httpbin.org/ip',
            'headers_check': 'http://httpbin.org/headers',
            'geo_check': 'http://ip-api.com/json',
            'speed_test': 'http://httpbin.org/bytes/1024',  # 1 KB payload
            'large_speed_test': 'http://httpbin.org/bytes/102400'  # 100 KB payload
        }
        
    def comprehensive_test(self, proxy_url):
        """全面的代理质量检测"""
        results = {
            'proxy': proxy_url,
            'timestamp': datetime.now().isoformat(),
            'basic_info': {},
            'speed_metrics': {},
            'anonymity_level': 'unknown',
            'geo_info': {},
            'stability_score': 0,
            'overall_score': 0
        }
        
        try:
            # Basic connectivity
            results['basic_info'] = self._test_basic_connectivity(proxy_url)
            
            if results['basic_info']['working']:
                # Speed tests
                results['speed_metrics'] = self._test_speed(proxy_url)
                
                # Anonymity classification
                results['anonymity_level'] = self._test_anonymity(proxy_url)
                
                # Geolocation lookup
                results['geo_info'] = self._get_geo_info(proxy_url)
                
                # Stability test
                results['stability_score'] = self._test_stability(proxy_url)
                
                # Aggregate score
                results['overall_score'] = self._calculate_score(results)
                
        except Exception as e:
            results['error'] = str(e)
        
        return results
    
    def _test_basic_connectivity(self, proxy_url):
        """Basic connectivity test."""
        try:
            start_time = time.time()
            response = requests.get(
                self.test_endpoints['ip_check'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            end_time = time.time()
            
            if response.status_code == 200:
                ip_data = response.json()
                return {
                    'working': True,
                    'response_time': end_time - start_time,
                    'ip_address': ip_data.get('origin', '').split(',')[0].strip(),
                    'status_code': response.status_code
                }
            # Treat non-200 responses as failures rather than falling through to None
            return {
                'working': False,
                'error': f'HTTP {response.status_code}'
            }
        except Exception as e:
            return {
                'working': False,
                'error': str(e)
            }
    
    def _test_speed(self, proxy_url):
        """速度性能测试"""
        speed_results = {}
        
        # Small-file download speed
        try:
            start_time = time.time()
            response = requests.get(
                self.test_endpoints['speed_test'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=30
            )
            end_time = time.time()
            
            if response.status_code == 200:
                download_time = end_time - start_time
                file_size_kb = len(response.content) / 1024
                speed_results['small_file'] = {
                    'download_time': download_time,
                    'speed_kbps': file_size_kb / download_time if download_time > 0 else 0
                }
        except requests.RequestException:
            speed_results['small_file'] = {'error': 'Failed'}
        
        # Large-file download speed
        try:
            start_time = time.time()
            response = requests.get(
                self.test_endpoints['large_speed_test'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=60
            )
            end_time = time.time()
            
            if response.status_code == 200:
                download_time = end_time - start_time
                file_size_kb = len(response.content) / 1024
                speed_results['large_file'] = {
                    'download_time': download_time,
                    'speed_kbps': file_size_kb / download_time if download_time > 0 else 0
                }
        except requests.RequestException:
            speed_results['large_file'] = {'error': 'Failed'}
        
        return speed_results
    
    def _test_anonymity(self, proxy_url):
        """Classify the proxy's anonymity level."""
        try:
            # Fetch our real IP (no proxy)
            original_response = requests.get(
                self.test_endpoints['ip_check'], 
                timeout=10
            )
            original_ip = original_response.json()['origin']
            
            # Fetch the headers the target sees through the proxy
            proxy_response = requests.get(
                self.test_endpoints['headers_check'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            
            if proxy_response.status_code != 200:
                return 'unknown'
            
            headers = proxy_response.json()['headers']
            headers_str = json.dumps(headers).lower()
            
            # Transparent: our real IP leaks through
            if original_ip in headers_str:
                return 'transparent'
            
            # Anonymous: no IP leak, but proxy-revealing headers are present
            proxy_indicators = [
                'x-forwarded-for', 'x-real-ip', 'via', 'x-proxy',
                'forwarded', 'x-forwarded', 'proxy-connection'
            ]
            
            if any(indicator in headers_str for indicator in proxy_indicators):
                return 'anonymous'
            
            return 'elite'
            
        except Exception:
            return 'unknown'
    
    def _get_geo_info(self, proxy_url):
        """获取地理位置信息"""
        try:
            response = requests.get(
                self.test_endpoints['geo_check'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            
            if response.status_code == 200:
                data = response.json()
                return {
                    'country': data.get('country'),
                    'country_code': data.get('countryCode'),
                    'region': data.get('regionName'),
                    'city': data.get('city'),
                    'isp': data.get('isp'),
                    'org': data.get('org'),
                    'timezone': data.get('timezone'),
                    'is_datacenter': data.get('hosting', False)
                }
        except Exception:
            pass
        
        return {}
    
    def _test_stability(self, proxy_url, test_count=5):
        """稳定性测试"""
        success_count = 0
        total_response_time = 0
        
        for i in range(test_count):
            try:
                start_time = time.time()
                response = requests.get(
                    self.test_endpoints['ip_check'],
                    proxies={'http': proxy_url, 'https': proxy_url},
                    timeout=10
                )
                end_time = time.time()
                
                if response.status_code == 200:
                    success_count += 1
                    total_response_time += (end_time - start_time)
                
                time.sleep(1)  # 1-second gap between checks
                
            except Exception:
                pass
        
        stability_rate = success_count / test_count
        avg_response_time = total_response_time / success_count if success_count > 0 else 0
        
        return {
            'success_rate': stability_rate,
            'avg_response_time': avg_response_time,
            'test_count': test_count
        }
    
    def _calculate_score(self, results):
        """计算综合评分(0-100)"""
        score = 0
        
        # 基础可用性 (30分)
        if results['basic_info'].get('working'):
            score += 30
            
            # Response-time score (20 points)
            response_time = results['basic_info'].get('response_time', 999)
            if response_time < 1:
                score += 20
            elif response_time < 3:
                score += 15
            elif response_time < 5:
                score += 10
            elif response_time < 10:
                score += 5
        
        # Anonymity score (20 points)
        anonymity = results.get('anonymity_level', 'unknown')
        if anonymity == 'elite':
            score += 20
        elif anonymity == 'anonymous':
            score += 15
        elif anonymity == 'transparent':
            score += 5
        
        # Stability score (20 points)
        if 'stability_score' in results and isinstance(results['stability_score'], dict):
            stability_rate = results['stability_score'].get('success_rate', 0)
            score += int(stability_rate * 20)
        
        # Speed score (10 points)
        if 'speed_metrics' in results:
            small_file = results['speed_metrics'].get('small_file', {})
            if 'speed_kbps' in small_file:
                speed = small_file['speed_kbps']
                if speed > 100:
                    score += 10
                elif speed > 50:
                    score += 7
                elif speed > 20:
                    score += 5
                elif speed > 10:
                    score += 3
        
        return min(score, 100)
    
    def batch_analyze(self, proxy_list, max_workers=10):
        """批量分析代理质量"""
        results = []
        
        print(f"开始深度分析 {len(proxy_list)} 个代理...")
        
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all jobs
            future_to_proxy = {
                executor.submit(self.comprehensive_test, proxy): proxy 
                for proxy in proxy_list
            }
            
            # Collect results as they complete
            for i, future in enumerate(as_completed(future_to_proxy), 1):
                proxy = future_to_proxy[future]
                try:
                    result = future.result()
                    results.append(result)
                    
                    print(f"[{i}/{len(proxy_list)}] {proxy}")
                    if result['basic_info'].get('working'):
                        print(f"  ✓ 评分: {result['overall_score']}/100")
                        print(f"  ✓ 匿名性: {result['anonymity_level']}")
                        print(f"  ✓ 位置: {result['geo_info'].get('country', 'Unknown')}")
                    else:
                        print(f"  ✗ 不可用")
                        
                except Exception as e:
                    print(f"[{i}/{len(proxy_list)}] {proxy} - 分析失败: {e}")
        
        return results

# Usage example
analyzer = AdvancedProxyAnalyzer()

test_proxies = [
    'http://user:pass@proxy1.com:8080',
    'http://user:pass@proxy2.com:8080',
]

detailed_results = analyzer.batch_analyze(test_proxies)

# Sort by score
detailed_results.sort(key=lambda x: x['overall_score'], reverse=True)

print("\n=== Proxy Quality Ranking ===")
for result in detailed_results[:5]:  # top 5
    if result['basic_info'].get('working'):
        print(f"Score: {result['overall_score']}/100 - {result['proxy']}")

Real-Time Monitoring System

Continuously monitoring proxy status

import requests
import threading
import time
from datetime import datetime, timedelta
import sqlite3

class ProxyMonitor:
    def __init__(self, db_path='proxy_monitor.db'):
        self.db_path = db_path
        self.monitoring = False
        self.monitor_thread = None
        self.proxy_list = []
        self.check_interval = 300  # check every 5 minutes
        
        self._init_database()
    
    def _init_database(self):
        """初始化数据库"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS proxy_logs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                proxy_url TEXT,
                timestamp TEXT,
                status TEXT,
                response_time REAL,
                error_message TEXT,
                ip_address TEXT
            )
        ''')
        
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS proxy_stats (
                proxy_url TEXT PRIMARY KEY,
                total_checks INTEGER DEFAULT 0,
                success_count INTEGER DEFAULT 0,
                avg_response_time REAL DEFAULT 0,
                last_check TEXT,
                last_success TEXT,
                consecutive_failures INTEGER DEFAULT 0,
                uptime_percentage REAL DEFAULT 0
            )
        ''')
        
        conn.commit()
        conn.close()
    
    def add_proxy(self, proxy_url):
        """添加代理到监控列表"""
        if proxy_url not in self.proxy_list:
            self.proxy_list.append(proxy_url)
            
            # Initialize its stats row
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO proxy_stats (proxy_url) VALUES (?)
            ''', (proxy_url,))
            conn.commit()
            conn.close()
    
    def remove_proxy(self, proxy_url):
        """从监控列表移除代理"""
        if proxy_url in self.proxy_list:
            self.proxy_list.remove(proxy_url)
    
    def _check_single_proxy(self, proxy_url):
        """检查单个代理状态"""
        timestamp = datetime.now().isoformat()
        
        try:
            start_time = time.time()
            response = requests.get(
                'http://httpbin.org/ip',
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            end_time = time.time()
            
            response_time = end_time - start_time
            
            if response.status_code == 200:
                ip_data = response.json()
                ip_address = ip_data.get('origin', '').split(',')[0].strip()
                
                self._log_result(proxy_url, timestamp, 'success', response_time, None, ip_address)
                return True, response_time, ip_address
            else:
                self._log_result(proxy_url, timestamp, 'failed', response_time, f'HTTP {response.status_code}', None)
                return False, response_time, None
                
        except Exception as e:
            self._log_result(proxy_url, timestamp, 'error', None, str(e), None)
            return False, None, None
    
    def _log_result(self, proxy_url, timestamp, status, response_time, error_message, ip_address):
        """记录检查结果"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        # Insert the log row
        cursor.execute('''
            INSERT INTO proxy_logs 
            (proxy_url, timestamp, status, response_time, error_message, ip_address)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (proxy_url, timestamp, status, response_time, error_message, ip_address))
        
        # Update aggregate counters
        cursor.execute('''
            UPDATE proxy_stats 
            SET total_checks = total_checks + 1,
                last_check = ?
            WHERE proxy_url = ?
        ''', (timestamp, proxy_url))
        
        if status == 'success':
            cursor.execute('''
                UPDATE proxy_stats 
                SET success_count = success_count + 1,
                    last_success = ?,
                    consecutive_failures = 0
                WHERE proxy_url = ?
            ''', (timestamp, proxy_url))
        else:
            cursor.execute('''
                UPDATE proxy_stats 
                SET consecutive_failures = consecutive_failures + 1
                WHERE proxy_url = ?
            ''', (proxy_url,))
        
        # Update the average response time (only successful checks should count)
        if status == 'success' and response_time:
            cursor.execute('''
                SELECT avg_response_time, success_count FROM proxy_stats 
                WHERE proxy_url = ?
            ''', (proxy_url,))
            
            result = cursor.fetchone()
            if result:
                old_avg, success_count = result
                if old_avg == 0:
                    new_avg = response_time
                else:
                    new_avg = (old_avg * (success_count - 1) + response_time) / success_count
                
                cursor.execute('''
                    UPDATE proxy_stats 
                    SET avg_response_time = ?
                    WHERE proxy_url = ?
                ''', (new_avg, proxy_url))
        
        # Recompute the uptime percentage
        cursor.execute('''
            SELECT total_checks, success_count FROM proxy_stats 
            WHERE proxy_url = ?
        ''', (proxy_url,))
        
        result = cursor.fetchone()
        if result:
            total, success = result
            uptime = (success / total * 100) if total > 0 else 0
            cursor.execute('''
                UPDATE proxy_stats 
                SET uptime_percentage = ?
                WHERE proxy_url = ?
            ''', (uptime, proxy_url))
        
        conn.commit()
        conn.close()
    
    def _monitor_loop(self):
        """Main monitoring loop."""
        while self.monitoring:
            print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking proxy status...")
            
            for proxy in self.proxy_list:
                if not self.monitoring:
                    break
                
                success, response_time, ip = self._check_single_proxy(proxy)
                
                if success:
                    print(f"✓ {proxy} - {response_time:.2f}s - {ip}")
                else:
                    print(f"✗ {proxy} - 检查失败")
                
                time.sleep(1)  # 避免请求过于频繁
            
            if self.monitoring:
                print(f"等待 {self.check_interval} 秒后进行下次检查...")
                time.sleep(self.check_interval)
    
    def start_monitoring(self):
        """Start the background monitoring thread."""
        if self.monitoring:
            print("Monitoring is already running")
            return
        
        self.monitoring = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop)
        self.monitor_thread.daemon = True
        self.monitor_thread.start()
        
        print(f"开始监控 {len(self.proxy_list)} 个代理,检查间隔: {self.check_interval}秒")
    
    def stop_monitoring(self):
        """停止监控"""
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join()
        print("监控已停止")
    
    def get_proxy_stats(self, proxy_url=None):
        """获取代理统计信息"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        if proxy_url:
            cursor.execute('''
                SELECT * FROM proxy_stats WHERE proxy_url = ?
            ''', (proxy_url,))
            result = cursor.fetchone()
        else:
            cursor.execute('''
                SELECT * FROM proxy_stats ORDER BY uptime_percentage DESC
            ''')
            result = cursor.fetchall()
        
        conn.close()
        return result
    
    def get_recent_logs(self, proxy_url=None, hours=24):
        """获取最近的日志记录"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        since_time = (datetime.now() - timedelta(hours=hours)).isoformat()
        
        if proxy_url:
            cursor.execute('''
                SELECT * FROM proxy_logs 
                WHERE proxy_url = ? AND timestamp > ?
                ORDER BY timestamp DESC
            ''', (proxy_url, since_time))
        else:
            cursor.execute('''
                SELECT * FROM proxy_logs 
                WHERE timestamp > ?
                ORDER BY timestamp DESC
            ''', (since_time,))
        
        result = cursor.fetchall()
        conn.close()
        return result
    
    def generate_report(self):
        """Print a monitoring report."""
        stats = self.get_proxy_stats()
        
        print("\n" + "="*60)
        print("Proxy Monitoring Report")
        print("="*60)
        
        if not stats:
            print("No monitoring data yet")
            return
        
        print(f"{'Proxy':<30} {'Uptime':<10} {'Avg resp.':<10} {'Consec. fails':<13}")
        print("-" * 70)
        
        for stat in stats:
            proxy_url, total, success, avg_time, last_check, last_success, failures, uptime = stat
            
            # Truncate long proxy URLs for display
            short_proxy = proxy_url[:27] + "..." if len(proxy_url) > 30 else proxy_url
            
            print(f"{short_proxy:<30} {uptime:>8.1f}% {avg_time:>8.2f}s {failures:>8d}")
        
        # Overall summary
        total_proxies = len(stats)
        healthy_proxies = sum(1 for s in stats if s[7] > 90)  # uptime > 90%
        
        print(f"\nTotal proxies: {total_proxies}")
        print(f"Healthy proxies: {healthy_proxies} ({healthy_proxies/total_proxies*100:.1f}%)")

# Usage example
monitor = ProxyMonitor()

# Register proxies for monitoring
proxies_to_monitor = [
    'http://user:pass@proxy1.com:8080',
    'http://user:pass@proxy2.com:8080',
    'http://user:pass@proxy3.com:8080',
]

for proxy in proxies_to_monitor:
    monitor.add_proxy(proxy)

# Start monitoring
monitor.start_monitoring()

# Check the report after the monitor has run for a while
time.sleep(60)  # wait 1 minute
monitor.generate_report()

# Stop monitoring
# monitor.stop_monitoring()

Proxy Performance Benchmarking

Stress-testing tool

import asyncio
import aiohttp
import requests
import statistics
import time
from concurrent.futures import ThreadPoolExecutor

class ProxyBenchmark:
    def __init__(self):
        self.test_urls = [
            'http://httpbin.org/ip',
            'http://httpbin.org/user-agent',
            'http://httpbin.org/headers',
            'https://api.ipify.org?format=json'
        ]
    
    def stress_test_sync(self, proxy_url, concurrent_requests=10, total_requests=100):
        """Synchronous (thread-pool) stress test."""
        print(f"Starting stress test: {proxy_url}")
        print(f"Concurrency: {concurrent_requests}, total requests: {total_requests}")
        
        results = {
            'proxy': proxy_url,
            'concurrent_requests': concurrent_requests,
            'total_requests': total_requests,
            'success_count': 0,
            'failed_count': 0,
            'response_times': [],
            'errors': [],
            'start_time': time.time(),
            'end_time': None
        }
        
        def single_request():
            try:
                start = time.time()
                response = requests.get(
                    'http://httpbin.org/ip',
                    proxies={'http': proxy_url, 'https': proxy_url},
                    timeout=30
                )
                end = time.time()
                
                if response.status_code == 200:
                    return True, end - start, None
                else:
                    return False, end - start, f"HTTP {response.status_code}"
            except Exception as e:
                return False, None, str(e)
        
        # Run the requests concurrently on a thread pool
        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(single_request) for _ in range(total_requests)]
            
            for future in futures:
                success, response_time, error = future.result()
                
                if success:
                    results['success_count'] += 1
                    if response_time:
                        results['response_times'].append(response_time)
                else:
                    results['failed_count'] += 1
                    if error:
                        results['errors'].append(error)
        
        results['end_time'] = time.time()
        
        # Compute summary statistics
        if results['response_times']:
            results['avg_response_time'] = statistics.mean(results['response_times'])
            results['min_response_time'] = min(results['response_times'])
            results['max_response_time'] = max(results['response_times'])
            results['median_response_time'] = statistics.median(results['response_times'])
        
        results['success_rate'] = (results['success_count'] / total_requests) * 100
        results['total_time'] = results['end_time'] - results['start_time']
        results['requests_per_second'] = total_requests / results['total_time']
        
        return results
    
    async def stress_test_async(self, proxy_url, concurrent_requests=10, total_requests=100):
        """异步压力测试(更高效)"""
        print(f"开始异步压力测试: {proxy_url}")
        
        results = {
            'proxy': proxy_url,
            'concurrent_requests': concurrent_requests,
            'total_requests': total_requests,
            'success_count': 0,
            'failed_count': 0,
            'response_times': [],
            'errors': [],
            'start_time': time.time(),
            'end_time': None
        }
        
        # Split credentials out of the proxy URL (aiohttp takes auth separately)
        proxy_parts = proxy_url.replace('http://', '').split('@')
        if len(proxy_parts) == 2:
            auth, host_port = proxy_parts
            username, password = auth.split(':')
            host, port = host_port.split(':')
            proxy_auth = aiohttp.BasicAuth(username, password)
            proxy_url_clean = f'http://{host}:{port}'
        else:
            proxy_auth = None
            proxy_url_clean = proxy_url
        
        async def single_request(session, semaphore):
            async with semaphore:
                try:
                    start = time.time()
                    async with session.get(
                        'http://httpbin.org/ip',
                        proxy=proxy_url_clean,
                        proxy_auth=proxy_auth,
                        timeout=aiohttp.ClientTimeout(total=30)
                    ) as response:
                        end = time.time()
                        await response.text()  # drain the response body
                        
                        if response.status == 200:
                            return True, end - start, None
                        else:
                            return False, end - start, f"HTTP {response.status}"
                            
                except Exception as e:
                    return False, None, str(e)
        
        # A semaphore caps the number of in-flight requests
        semaphore = asyncio.Semaphore(concurrent_requests)
        
        async with aiohttp.ClientSession() as session:
            tasks = [single_request(session, semaphore) for _ in range(total_requests)]
            responses = await asyncio.gather(*tasks)
            
            for success, response_time, error in responses:
                if success:
                    results['success_count'] += 1
                    if response_time:
                        results['response_times'].append(response_time)
                else:
                    results['failed_count'] += 1
                    if error:
                        results['errors'].append(error)
        
        results['end_time'] = time.time()
        
        # Compute summary statistics
        if results['response_times']:
            results['avg_response_time'] = statistics.mean(results['response_times'])
            results['min_response_time'] = min(results['response_times'])
            results['max_response_time'] = max(results['response_times'])
            results['median_response_time'] = statistics.median(results['response_times'])
        
        results['success_rate'] = (results['success_count'] / total_requests) * 100
        results['total_time'] = results['end_time'] - results['start_time']
        results['requests_per_second'] = total_requests / results['total_time']
        
        return results
    
    def print_benchmark_results(self, results):
        """Pretty-print benchmark results."""
        print(f"\n{'='*60}")
        print(f"Stress test results: {results['proxy']}")
        print(f"{'='*60}")
        print(f"Total requests: {results['total_requests']}")
        print(f"Concurrency: {results['concurrent_requests']}")
        print(f"Successful: {results['success_count']}")
        print(f"Failed: {results['failed_count']}")
        print(f"Success rate: {results['success_rate']:.2f}%")
        print(f"Total time: {results['total_time']:.2f}s")
        print(f"Throughput: {results['requests_per_second']:.2f} req/s")
        
        if results['response_times']:
            print(f"\nResponse time stats:")
            print(f"  mean: {results['avg_response_time']:.3f}s")
            print(f"  min: {results['min_response_time']:.3f}s")
            print(f"  max: {results['max_response_time']:.3f}s")
            print(f"  median: {results['median_response_time']:.3f}s")
        
        if results['errors']:
            print(f"\nError breakdown:")
            error_counts = {}
            for error in results['errors']:
                error_counts[error] = error_counts.get(error, 0) + 1
            
            for error, count in error_counts.items():
                print(f"  {error}: {count} times")
    
    def compare_proxies(self, proxy_list, concurrent_requests=5, total_requests=50):
        """Benchmark several proxies head to head."""
        print(f"Comparing the performance of {len(proxy_list)} proxies...")
        
        all_results = []
        
        for i, proxy in enumerate(proxy_list, 1):
            print(f"\n[{i}/{len(proxy_list)}] 测试代理: {proxy}")
            
            # Use the async test for efficiency
            result = asyncio.run(self.stress_test_async(
                proxy, concurrent_requests, total_requests
            ))
            
            all_results.append(result)
            self.print_benchmark_results(result)
        
        # Produce the comparison report
        self._generate_comparison_report(all_results)
        
        return all_results
    
    def _generate_comparison_report(self, results):
        """Print a comparison report."""
        print(f"\n{'='*80}")
        print("Proxy Performance Comparison")
        print(f"{'='*80}")
        
        # Sort by success rate
        results.sort(key=lambda x: x['success_rate'], reverse=True)
        
        print(f"{'Rank':<5} {'Proxy':<35} {'Success':<9} {'Avg resp.':<12} {'Throughput':<12}")
        print("-" * 80)
        
        for i, result in enumerate(results, 1):
            proxy_short = result['proxy'][:32] + "..." if len(result['proxy']) > 35 else result['proxy']
            success_rate = f"{result['success_rate']:.1f}%"
            avg_time = f"{result.get('avg_response_time', 0):.3f}s" if result.get('avg_response_time') else "N/A"
            rps = f"{result['requests_per_second']:.1f}/s"
            
            print(f"{i:<5} {proxy_short:<35} {success_rate:<9} {avg_time:<12} {rps:<12}")
        
        # Recommend the best proxy
        if results:
            best_proxy = results[0]
            print(f"\n🏆 Recommended proxy: {best_proxy['proxy']}")
            print(f"   Success rate: {best_proxy['success_rate']:.1f}%")
            print(f"   Avg response time: {best_proxy.get('avg_response_time', 0):.3f}s")

# Usage example
benchmark = ProxyBenchmark()

test_proxies = [
    'http://user:pass@proxy1.com:8080',
    'http://user:pass@proxy2.com:8080',
    'http://user:pass@proxy3.com:8080',
]

# Head-to-head comparison
comparison_results = benchmark.compare_proxies(
    test_proxies, 
    concurrent_requests=3, 
    total_requests=30
)

# Standalone stress test
# single_result = asyncio.run(benchmark.stress_test_async(
#     'http://user:pass@proxy1.com:8080',
#     concurrent_requests=10,
#     total_requests=100
# ))
# benchmark.print_benchmark_results(single_result)

Proxy Health Scoring System

Composite scoring algorithm

class ProxyHealthScorer:
    def __init__(self):
        # Relative weight of each metric in the composite score
        self.weights = {
            'availability': 0.3,
            'speed': 0.25,
            'stability': 0.25,
            'anonymity': 0.15,
            'geographic': 0.05
        }
    
    def calculate_health_score(self, proxy_data):
        """Compute a composite health score for one proxy."""
        scores = {}
        
        # Availability score (0-100)
        scores['availability'] = self._score_availability(proxy_data)
        
        # Speed score (0-100)
        scores['speed'] = self._score_speed(proxy_data)
        
        # Stability score (0-100)
        scores['stability'] = self._score_stability(proxy_data)
        
        # Anonymity score (0-100)
        scores['anonymity'] = self._score_anonymity(proxy_data)
        
        # Location score (0-100)
        scores['geographic'] = self._score_geographic(proxy_data)
        
        # Weighted total
        total_score = sum(
            scores[metric] * self.weights[metric] 
            for metric in scores
        )
        
        return {
            'total_score': round(total_score, 2),
            'detailed_scores': scores,
            'grade': self._get_grade(total_score),
            'recommendations': self._get_recommendations(scores)
        }
    
    def _score_availability(self, data):
        """Availability score."""
        # Recent success rate
        recent_success_rate = data.get('recent_success_rate', 0)
        
        # Penalty for consecutive failures
        consecutive_failures = data.get('consecutive_failures', 0)
        failure_penalty = min(consecutive_failures * 10, 50)
        
        # Overall uptime
        uptime = data.get('uptime_percentage', 0)
        
        base_score = (recent_success_rate + uptime) / 2
        final_score = max(0, base_score - failure_penalty)
        
        return min(100, final_score)
    
    def _score_speed(self, data):
        """Speed score."""
        avg_response_time = data.get('avg_response_time', 999)
        
        if avg_response_time == 0 or avg_response_time > 30:
            return 0
        
        # Response-time scoring curve
        if avg_response_time <= 1:
            return 100
        elif avg_response_time <= 2:
            return 90
        elif avg_response_time <= 3:
            return 80
        elif avg_response_time <= 5:
            return 70
        elif avg_response_time <= 8:
            return 60
        elif avg_response_time <= 12:
            return 50
        elif avg_response_time <= 20:
            return 30
        else:
            return 10
    
    def _score_stability(self, data):
        """Stability score."""
        # Variability of the response time
        response_time_std = data.get('response_time_std', 0)
        stability_score = max(0, 100 - response_time_std * 20)
        
        # Penalty for outages
        downtime_events = data.get('downtime_events_24h', 0)
        downtime_penalty = min(downtime_events * 15, 60)
        
        # Long-term uptime
        long_term_uptime = data.get('uptime_7d', 0)
        
        final_score = (stability_score + long_term_uptime) / 2 - downtime_penalty
        
        return max(0, min(100, final_score))
    
    def _score_anonymity(self, data):
        """匿名性评分"""
        anonymity_level = data.get('anonymity_level', 'unknown')
        
        anonymity_scores = {
            'elite': 100,
            'anonymous': 75,
            'transparent': 25,
            'unknown': 0
        }
        
        base_score = anonymity_scores.get(anonymity_level, 0)
        
        # Penalize any history of IP leaks
        ip_leak_incidents = data.get('ip_leak_incidents', 0)
        leak_penalty = min(ip_leak_incidents * 20, 50)
        
        return max(0, base_score - leak_penalty)
    
    def _score_geographic(self, data):
        """Location score."""
        # Match against the target region
        target_country = data.get('target_country')
        proxy_country = data.get('proxy_country')
        
        if target_country and proxy_country:
            if target_country.lower() == proxy_country.lower():
                return 100
            else:
                return 50
        
        # Fall back to the IP type
        is_datacenter = data.get('is_datacenter', True)
        if is_datacenter:
            return 60  # datacenter IPs score lower
        else:
            return 90  # residential IPs score higher
    
    def _get_grade(self, score):
        """根据评分获取等级"""
        if score >= 90:
            return 'A+'
        elif score >= 80:
            return 'A'
        elif score >= 70:
            return 'B+'
        elif score >= 60:
            return 'B'
        elif score >= 50:
            return 'C'
        else:
            return 'D'
    
    def _get_recommendations(self, scores):
        """Suggest improvements based on the sub-scores."""
        recommendations = []
        
        if scores['availability'] < 70:
            recommendations.append("Low availability: consider switching providers or checking your network setup")
        
        if scores['speed'] < 60:
            recommendations.append("Slow responses: consider proxies geographically closer to the target")
        
        if scores['stability'] < 70:
            recommendations.append("Poor stability: add proxy rotation or keep backups on standby")
        
        if scores['anonymity'] < 80:
            recommendations.append("Weak anonymity: use elite (high-anonymity) proxies or residential IPs")
        
        if scores['geographic'] < 80:
            recommendations.append("Location mismatch: pick proxies in the target region")
        
        return recommendations
        return recommendations
    
    def batch_score_proxies(self, proxy_data_list):
        """批量评分代理"""
        results = []
        
        for proxy_data in proxy_data_list:
            score_result = self.calculate_health_score(proxy_data)
            score_result['proxy_url'] = proxy_data.get('proxy_url', 'Unknown')
            results.append(score_result)
        
        # Sort by total score
        results.sort(key=lambda x: x['total_score'], reverse=True)
        
        return results
    
    def generate_health_report(self, scored_proxies):
        """Print a health assessment report."""
        print(f"\n{'='*80}")
        print("Proxy Health Assessment Report")
        print(f"{'='*80}")
        
        if not scored_proxies:
            print("No proxy data yet")
            return
        
        # Overall summary
        total_proxies = len(scored_proxies)
        avg_score = sum(p['total_score'] for p in scored_proxies) / total_proxies
        
        grade_counts = {}
        for proxy in scored_proxies:
            grade = proxy['grade']
            grade_counts[grade] = grade_counts.get(grade, 0) + 1
        
        print(f"Total proxies: {total_proxies}")
        print(f"Average score: {avg_score:.2f}")
        print(f"Grade distribution: {dict(sorted(grade_counts.items()))}")
        
        # Detailed listing
        print(f"\n{'Rank':<5} {'Grade':<6} {'Score':<6} {'Proxy':<40} {'Main issue'}")
        print("-" * 80)
        
        for i, proxy in enumerate(scored_proxies[:10], 1):  # top 10
            proxy_short = proxy['proxy_url'][:37] + "..." if len(proxy['proxy_url']) > 40 else proxy['proxy_url']
            
            # The lowest sub-score is reported as the main issue
            detailed = proxy['detailed_scores']
            min_metric = min(detailed.keys(), key=lambda k: detailed[k])
            main_issue = f"{min_metric}({detailed[min_metric]:.0f})"
            
            print(f"{i:<5} {proxy['grade']:<6} {proxy['total_score']:<6.1f} {proxy_short:<40} {main_issue}")
        
        # Highlights and warnings
        excellent_proxies = [p for p in scored_proxies if p['total_score'] >= 90]
        poor_proxies = [p for p in scored_proxies if p['total_score'] < 50]
        
        if excellent_proxies:
            print(f"\n🏆 Excellent proxies ({len(excellent_proxies)}):")
            for proxy in excellent_proxies[:3]:
                print(f"   {proxy['proxy_url']} (score: {proxy['total_score']})")
        
        if poor_proxies:
            print(f"\n⚠️  Proxies needing attention ({len(poor_proxies)}):")
            for proxy in poor_proxies[:3]:
                print(f"   {proxy['proxy_url']} (score: {proxy['total_score']})")
                if proxy['recommendations']:
                    print(f"     Suggestion: {proxy['recommendations'][0]}")

# Usage example
scorer = ProxyHealthScorer()

# Sample proxy metrics
sample_proxy_data = [
    {
        'proxy_url': 'http://user:pass@proxy1.com:8080',
        'recent_success_rate': 95,
        'consecutive_failures': 0,
        'uptime_percentage': 98,
        'avg_response_time': 1.2,
        'response_time_std': 0.3,
        'downtime_events_24h': 1,
        'uptime_7d': 96,
        'anonymity_level': 'elite',
        'ip_leak_incidents': 0,
        'is_datacenter': False,
        'proxy_country': 'US',
        'target_country': 'US'
    },
    {
        'proxy_url': 'http://user:pass@proxy2.com:8080',
        'recent_success_rate': 75,
        'consecutive_failures': 2,
        'uptime_percentage': 85,
        'avg_response_time': 3.5,
        'response_time_std': 1.2,
        'downtime_events_24h': 3,
        'uptime_7d': 80,
        'anonymity_level': 'anonymous',
        'ip_leak_incidents': 1,
        'is_datacenter': True,
        'proxy_country': 'DE',
        'target_country': 'US'
    }
]

# Score the batch
scored_results = scorer.batch_score_proxies(sample_proxy_data)

# Print the report
scorer.generate_health_report(scored_results)

# Inspect the detailed scores for each proxy
for result in scored_results:
    print(f"\nProxy: {result['proxy_url']}")
    print(f"Total score: {result['total_score']} ({result['grade']})")
    print("Sub-scores:")
    for metric, score in result['detailed_scores'].items():
        print(f"  {metric}: {score:.1f}")
    if result['recommendations']:
        print("Suggestions:")
        for rec in result['recommendations']:
            print(f"  - {rec}")

Practical Recommendations

1. Check frequency

# Adjust the check frequency to your use case
detection_schedules = {
    'production': {
        'basic_check': 'every 5 minutes',
        'deep_analysis': 'every hour',
        'benchmark': 'daily',
        'health_score': 'every 6 hours'
    },
    'development': {
        'basic_check': 'every 15 minutes',
        'deep_analysis': 'every 4 hours',
        'benchmark': 'weekly',
        'health_score': 'daily'
    },
    'testing': {
        'basic_check': 'before each use',
        'deep_analysis': 'before each use',
        'benchmark': 'before purchase',
        'health_score': 'before purchase'
    }
}
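
If you would rather drive these cadences from code than from cron, a plain loop like the sketch below is enough. The INTERVALS mapping and the run_schedule helper are illustrative assumptions; in production you would more likely use cron or a scheduler library such as APScheduler.

import time

# Hypothetical mapping of the 'production' labels above onto seconds
INTERVALS = {
    'basic_check': 300,      # "every 5 minutes"
    'deep_analysis': 3600,   # "every hour"
}

def run_schedule(checker, analyzer, proxies):
    """Fire each check when its interval elapses; a minimal sketch, not production-grade."""
    last_run = {task: 0.0 for task in INTERVALS}
    while True:
        now = time.time()
        if now - last_run['basic_check'] >= INTERVALS['basic_check']:
            checker.quick_test(proxies)        # BasicProxyChecker from earlier
            last_run['basic_check'] = now
        if now - last_run['deep_analysis'] >= INTERVALS['deep_analysis']:
            analyzer.batch_analyze(proxies)    # AdvancedProxyAnalyzer from earlier
            last_run['deep_analysis'] = now
        time.sleep(30)  # coarse tick; fine at these intervals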

2. Automated proxy management

class AutoProxyManager:
    def __init__(self):
        self.active_proxies = []
        self.backup_proxies = []
        self.blacklisted_proxies = []
        
    def auto_rotate_bad_proxies(self):
        """Automatically rotate out underperforming proxies."""
        for proxy in self.active_proxies[:]:
            health_score = self.get_proxy_health_score(proxy)
            
            if health_score < 50:
                print(f"Proxy {proxy} is underperforming; moving it to the backup list")
                self.active_proxies.remove(proxy)
                self.backup_proxies.append(proxy)
                
                # Promote a proxy from the backup list to take its place
                if self.backup_proxies:
                    promoted = self.backup_proxies.pop(0)
                    self.active_proxies.append(promoted)
                    print(f"Promoted proxy {promoted} to the active list")
    
    def get_proxy_health_score(self, proxy):
        # Hook for the ProxyHealthScorer defined above: feed your collected metrics into
        # calculate_health_score() and return its 'total_score'. The neutral default
        # keeps every proxy active until the hook is wired up.
        return 100

3. Cost-effectiveness analysis

def analyze_proxy_cost_effectiveness(proxy_results, monthly_cost):
    """Relate proxy quality to what it costs"""
    
    for result in proxy_results:
        success_rate = result.get('success_rate', 0)
        avg_response_time = result.get('avg_response_time', 999)
        
        # Effective monthly cost scaled up by the failure rate, plus a crude efficiency ratio
        if success_rate > 0:
            cost_per_success = monthly_cost / (success_rate / 100)
            efficiency_score = (success_rate / 100) / (avg_response_time + 1)
            
            print(f"Proxy: {result['proxy']}")
            print(f"  Success rate: {success_rate:.1f}%")
            print(f"  Effective monthly cost: ${cost_per_success:.2f}")
            print(f"  Efficiency score: {efficiency_score:.3f}")

Summary

Testing proxy IP quality is a systematic effort that needs to cover several dimensions:

Core metrics:

  • Availability: the most basic metric; determines whether the proxy works at all
  • Speed: the key factor in crawler throughput
  • Stability: determines how reliable the service is
  • Anonymity: affects how well you evade anti-scraping measures
  • Location: affects how the target site treats your requests

Choosing a testing tool:

  • Basic checks: for quick day-to-day verification
  • In-depth analysis: for evaluation before purchase
  • Stress testing: for validating high-concurrency scenarios
  • Continuous monitoring: for production environments

Practical advice:

  1. Build a layered testing pipeline: basic check → in-depth analysis → continuous monitoring (see the sketch after this list)
  2. Set sensible scoring standards: adjust the weights to your business needs
  3. Automate proxy management: less manual intervention, more efficiency
  4. Control costs: balance quality against price and pick the best value
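
To make point 1 concrete, here is one way to chain the three tools from this article into a single funnel; the min_score threshold of 70 is an arbitrary assumption you should tune to your own needs.

def build_proxy_pool(candidates, min_score=70):
    """Layered vetting: quick check -> deep analysis -> continuous monitoring.
    A sketch wiring together the classes defined earlier; min_score is illustrative."""
    checker = BasicProxyChecker()
    analyzer = AdvancedProxyAnalyzer()
    monitor = ProxyMonitor()
    
    # Tier 1: cheap availability filter
    alive = [r['proxy'] for r in checker.quick_test(candidates) if r['working']]
    
    # Tier 2: in-depth analysis on the survivors only
    analyzed = analyzer.batch_analyze(alive)
    vetted = [r['proxy'] for r in analyzed if r['overall_score'] >= min_score]
    
    # Tier 3: keep watching the vetted pool in the background
    for proxy in vetted:
        monitor.add_proxy(proxy)
    monitor.start_monitoring()
    return vetted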

Remember: there is no perfect proxy, only the proxy that fits your needs best. Pick your testing methods according to your actual requirements, and evaluate and refresh your proxy pool regularly; that is what keeps a crawler project running reliably.
