Preface
Bought proxy IPs but have no idea how good they actually are? They work fine until they suddenly don't? Anyone who does web scraping has run into these problems.
Proxy quality directly determines your scraper's success rate, and stability determines whether a project can keep running at all. This article shares a complete proxy testing toolkit to help you filter out the proxies that are genuinely worth using.
Proxy Quality Metrics
Core metrics (a minimal record type for tracking them is sketched after this list)
1. Availability
- Can a connection be established?
- Is the HTTP status code correct?
- Is the response body complete?
2. Speed
- Connection setup time
- Response time
- Download throughput
3. Anonymity
- Does the proxy leak your real IP?
- Does it add proxy-identifying headers?
- What anonymity level does it provide?
4. Stability
- Continuous uptime
- Failure-rate trend
- Frequency of outages
5. Location
- Where the IP is registered
- Distance to the target server
- Time-zone match
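To make these five dimensions concrete, here is a minimal sketch of a per-proxy measurement record; the type and field names are illustrative, not part of any library:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ProxyMetrics:
    # One measurement record per proxy; all names here are illustrative.
    proxy_url: str
    available: bool = False                 # availability: did the test request succeed?
    response_time: Optional[float] = None   # speed: seconds for a test request
    anonymity_level: str = 'unknown'        # anonymity: 'transparent' / 'anonymous' / 'elite'
    success_rate: float = 0.0               # stability: rolling success ratio, 0-1
    country: Optional[str] = None           # location: where the exit IP is registered

The checker classes below all return plain dicts; a record type like this is just one convenient shape for storing what they measure.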
Basic Detection Tools
Simple availability check
import requests
import time
from datetime import datetime

class BasicProxyChecker:
    def __init__(self):
        self.test_urls = [
            'http://httpbin.org/ip',
            'https://api.ipify.org?format=json',
            'http://icanhazip.com'
        ]

    def check_proxy(self, proxy_url, timeout=10):
        """Basic proxy check."""
        result = {
            'proxy': proxy_url,
            'timestamp': datetime.now().isoformat(),
            'working': False,
            'response_time': None,
            'ip_address': None,
            'error': None
        }
        try:
            start_time = time.time()
            response = requests.get(
                self.test_urls[0],
                proxies={
                    'http': proxy_url,
                    'https': proxy_url
                },
                timeout=timeout,
                headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                }
            )
            end_time = time.time()
            if response.status_code == 200:
                result['working'] = True
                result['response_time'] = end_time - start_time
                # Extract the IP address from the response
                try:
                    ip_data = response.json()
                    result['ip_address'] = ip_data.get('origin', '').split(',')[0].strip()
                except ValueError:
                    result['ip_address'] = response.text.strip()
        except Exception as e:
            result['error'] = str(e)
        return result

    def quick_test(self, proxy_list):
        """Quick batch test."""
        results = []
        print(f"Testing {len(proxy_list)} proxies...")
        for i, proxy in enumerate(proxy_list, 1):
            print(f"[{i}/{len(proxy_list)}] Testing: {proxy}")
            result = self.check_proxy(proxy)
            results.append(result)
            if result['working']:
                print(f"✓ Working - IP: {result['ip_address']} - response time: {result['response_time']:.2f}s")
            else:
                print(f"✗ Not working - {result['error']}")
        # Summarize the results
        working_count = sum(1 for r in results if r['working'])
        print(f"\nDone: {working_count}/{len(proxy_list)} proxies are working")
        return results

# Usage example
checker = BasicProxyChecker()
test_proxies = [
    'http://username:password@proxy1.com:8080',
    'http://username:password@proxy2.com:8080',
    'http://username:password@proxy3.com:8080',
]
results = checker.quick_test(test_proxies)
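With the results in hand, filtering down to the usable proxies is a one-liner. A small follow-up, assuming the `results` list returned by `quick_test` above:

# Keep only proxies that responded, fastest first (assumes `results` from above)
working = sorted(
    (r for r in results if r['working']),
    key=lambda r: r['response_time']
)
good_proxies = [r['proxy'] for r in working]
print(good_proxies)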
Deep Quality Analysis
import requests
import time
import json
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

class AdvancedProxyAnalyzer:
    def __init__(self):
        self.test_endpoints = {
            'ip_check': 'http://httpbin.org/ip',
            'headers_check': 'http://httpbin.org/headers',
            'geo_check': 'http://ip-api.com/json',
            'speed_test': 'http://httpbin.org/bytes/1024',        # 1 KB payload
            'large_speed_test': 'http://httpbin.org/bytes/102400' # 100 KB payload
        }

    def comprehensive_test(self, proxy_url):
        """Run the full proxy quality analysis."""
        results = {
            'proxy': proxy_url,
            'timestamp': datetime.now().isoformat(),
            'basic_info': {},
            'speed_metrics': {},
            'anonymity_level': 'unknown',
            'geo_info': {},
            'stability_score': 0,
            'overall_score': 0
        }
        try:
            # Basic connectivity
            results['basic_info'] = self._test_basic_connectivity(proxy_url)
            if results['basic_info']['working']:
                # Speed test
                results['speed_metrics'] = self._test_speed(proxy_url)
                # Anonymity test
                results['anonymity_level'] = self._test_anonymity(proxy_url)
                # Geolocation info
                results['geo_info'] = self._get_geo_info(proxy_url)
                # Stability test
                results['stability_score'] = self._test_stability(proxy_url)
                # Overall score
                results['overall_score'] = self._calculate_score(results)
        except Exception as e:
            results['error'] = str(e)
        return results

    def _test_basic_connectivity(self, proxy_url):
        """Basic connectivity test."""
        try:
            start_time = time.time()
            response = requests.get(
                self.test_endpoints['ip_check'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            end_time = time.time()
            if response.status_code == 200:
                ip_data = response.json()
                return {
                    'working': True,
                    'response_time': end_time - start_time,
                    'ip_address': ip_data.get('origin', '').split(',')[0].strip(),
                    'status_code': response.status_code
                }
            return {
                'working': False,
                'error': f'HTTP {response.status_code}'
            }
        except Exception as e:
            return {
                'working': False,
                'error': str(e)
            }

    def _test_speed(self, proxy_url):
        """Speed/performance test."""
        speed_results = {}
        # Small-file download speed
        try:
            start_time = time.time()
            response = requests.get(
                self.test_endpoints['speed_test'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=30
            )
            end_time = time.time()
            if response.status_code == 200:
                download_time = end_time - start_time
                file_size_kb = len(response.content) / 1024
                speed_results['small_file'] = {
                    'download_time': download_time,
                    'speed_kbps': file_size_kb / download_time if download_time > 0 else 0
                }
        except Exception:
            speed_results['small_file'] = {'error': 'Failed'}
        # Large-file download speed
        try:
            start_time = time.time()
            response = requests.get(
                self.test_endpoints['large_speed_test'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=60
            )
            end_time = time.time()
            if response.status_code == 200:
                download_time = end_time - start_time
                file_size_kb = len(response.content) / 1024
                speed_results['large_file'] = {
                    'download_time': download_time,
                    'speed_kbps': file_size_kb / download_time if download_time > 0 else 0
                }
        except Exception:
            speed_results['large_file'] = {'error': 'Failed'}
        return speed_results

    def _test_anonymity(self, proxy_url):
        """Anonymity level test."""
        try:
            # Get our real IP (no proxy)
            original_response = requests.get(
                self.test_endpoints['ip_check'],
                timeout=10
            )
            original_ip = original_response.json()['origin']
            # Fetch the headers the target sees through the proxy
            proxy_response = requests.get(
                self.test_endpoints['headers_check'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            if proxy_response.status_code != 200:
                return 'unknown'
            headers = proxy_response.json()['headers']
            headers_str = json.dumps(headers).lower()
            # Does the proxy leak our real IP?
            if original_ip in headers_str:
                return 'transparent'
            # Does it add proxy-identifying headers?
            proxy_indicators = [
                'x-forwarded-for', 'x-real-ip', 'via', 'x-proxy',
                'forwarded', 'x-forwarded', 'proxy-connection'
            ]
            if any(indicator in headers_str for indicator in proxy_indicators):
                return 'anonymous'
            return 'elite'
        except Exception:
            return 'unknown'

    def _get_geo_info(self, proxy_url):
        """Fetch geolocation info for the exit IP."""
        try:
            response = requests.get(
                self.test_endpoints['geo_check'],
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            if response.status_code == 200:
                data = response.json()
                return {
                    'country': data.get('country'),
                    'country_code': data.get('countryCode'),
                    'region': data.get('regionName'),
                    'city': data.get('city'),
                    'isp': data.get('isp'),
                    'org': data.get('org'),
                    'timezone': data.get('timezone'),
                    'is_datacenter': data.get('hosting', False)
                }
        except Exception:
            pass
        return {}

    def _test_stability(self, proxy_url, test_count=5):
        """Stability test: several requests in a row."""
        success_count = 0
        total_response_time = 0
        for i in range(test_count):
            try:
                start_time = time.time()
                response = requests.get(
                    self.test_endpoints['ip_check'],
                    proxies={'http': proxy_url, 'https': proxy_url},
                    timeout=10
                )
                end_time = time.time()
                if response.status_code == 200:
                    success_count += 1
                    total_response_time += (end_time - start_time)
                time.sleep(1)  # 1-second interval between attempts
            except Exception:
                pass
        stability_rate = success_count / test_count
        avg_response_time = total_response_time / success_count if success_count > 0 else 0
        return {
            'success_rate': stability_rate,
            'avg_response_time': avg_response_time,
            'test_count': test_count
        }

    def _calculate_score(self, results):
        """Compute the overall score (0-100)."""
        score = 0
        # Basic availability (30 points)
        if results['basic_info'].get('working'):
            score += 30
        # Response time (20 points)
        response_time = results['basic_info'].get('response_time', 999)
        if response_time < 1:
            score += 20
        elif response_time < 3:
            score += 15
        elif response_time < 5:
            score += 10
        elif response_time < 10:
            score += 5
        # Anonymity (20 points)
        anonymity = results.get('anonymity_level', 'unknown')
        if anonymity == 'elite':
            score += 20
        elif anonymity == 'anonymous':
            score += 15
        elif anonymity == 'transparent':
            score += 5
        # Stability (20 points)
        if 'stability_score' in results and isinstance(results['stability_score'], dict):
            stability_rate = results['stability_score'].get('success_rate', 0)
            score += int(stability_rate * 20)
        # Speed (10 points)
        if 'speed_metrics' in results:
            small_file = results['speed_metrics'].get('small_file', {})
            if 'speed_kbps' in small_file:
                speed = small_file['speed_kbps']
                if speed > 100:
                    score += 10
                elif speed > 50:
                    score += 7
                elif speed > 20:
                    score += 5
                elif speed > 10:
                    score += 3
        return min(score, 100)

    def batch_analyze(self, proxy_list, max_workers=10):
        """Analyze proxy quality in bulk."""
        results = []
        print(f"Starting deep analysis of {len(proxy_list)} proxies...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            # Submit all jobs
            future_to_proxy = {
                executor.submit(self.comprehensive_test, proxy): proxy
                for proxy in proxy_list
            }
            # Collect results as they complete
            for i, future in enumerate(as_completed(future_to_proxy), 1):
                proxy = future_to_proxy[future]
                try:
                    result = future.result()
                    results.append(result)
                    print(f"[{i}/{len(proxy_list)}] {proxy}")
                    if result['basic_info'].get('working'):
                        print(f"  ✓ Score: {result['overall_score']}/100")
                        print(f"  ✓ Anonymity: {result['anonymity_level']}")
                        print(f"  ✓ Location: {result['geo_info'].get('country', 'Unknown')}")
                    else:
                        print(f"  ✗ Not working")
                except Exception as e:
                    print(f"[{i}/{len(proxy_list)}] {proxy} - analysis failed: {e}")
        return results

# Usage example
analyzer = AdvancedProxyAnalyzer()
test_proxies = [
    'http://user:pass@proxy1.com:8080',
    'http://user:pass@proxy2.com:8080',
]
detailed_results = analyzer.batch_analyze(test_proxies)
# Sort by score
detailed_results.sort(key=lambda x: x['overall_score'], reverse=True)
print("\n=== Proxy quality ranking ===")
for result in detailed_results[:5]:  # show the top 5
    if result['basic_info'].get('working'):
        print(f"Score: {result['overall_score']}/100 - {result['proxy']}")
Real-Time Monitoring System
Continuous proxy status monitoring
import requests
import threading
import time
from datetime import datetime, timedelta
import sqlite3

class ProxyMonitor:
    def __init__(self, db_path='proxy_monitor.db'):
        self.db_path = db_path
        self.monitoring = False
        self.monitor_thread = None
        self.proxy_list = []
        self.check_interval = 300  # check every 5 minutes
        self._init_database()

    def _init_database(self):
        """Initialize the database."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS proxy_logs (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                proxy_url TEXT,
                timestamp TEXT,
                status TEXT,
                response_time REAL,
                error_message TEXT,
                ip_address TEXT
            )
        ''')
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS proxy_stats (
                proxy_url TEXT PRIMARY KEY,
                total_checks INTEGER DEFAULT 0,
                success_count INTEGER DEFAULT 0,
                avg_response_time REAL DEFAULT 0,
                last_check TEXT,
                last_success TEXT,
                consecutive_failures INTEGER DEFAULT 0,
                uptime_percentage REAL DEFAULT 0
            )
        ''')
        conn.commit()
        conn.close()

    def add_proxy(self, proxy_url):
        """Add a proxy to the watch list."""
        if proxy_url not in self.proxy_list:
            self.proxy_list.append(proxy_url)
            # Initialize its stats record
            conn = sqlite3.connect(self.db_path)
            cursor = conn.cursor()
            cursor.execute('''
                INSERT OR IGNORE INTO proxy_stats (proxy_url) VALUES (?)
            ''', (proxy_url,))
            conn.commit()
            conn.close()

    def remove_proxy(self, proxy_url):
        """Remove a proxy from the watch list."""
        if proxy_url in self.proxy_list:
            self.proxy_list.remove(proxy_url)

    def _check_single_proxy(self, proxy_url):
        """Check the status of a single proxy."""
        timestamp = datetime.now().isoformat()
        try:
            start_time = time.time()
            response = requests.get(
                'http://httpbin.org/ip',
                proxies={'http': proxy_url, 'https': proxy_url},
                timeout=15
            )
            end_time = time.time()
            response_time = end_time - start_time
            if response.status_code == 200:
                ip_data = response.json()
                ip_address = ip_data.get('origin', '').split(',')[0].strip()
                self._log_result(proxy_url, timestamp, 'success', response_time, None, ip_address)
                return True, response_time, ip_address
            else:
                self._log_result(proxy_url, timestamp, 'failed', response_time, f'HTTP {response.status_code}', None)
                return False, response_time, None
        except Exception as e:
            self._log_result(proxy_url, timestamp, 'error', None, str(e), None)
            return False, None, None

    def _log_result(self, proxy_url, timestamp, status, response_time, error_message, ip_address):
        """Persist a check result."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        # Insert the log row
        cursor.execute('''
            INSERT INTO proxy_logs
            (proxy_url, timestamp, status, response_time, error_message, ip_address)
            VALUES (?, ?, ?, ?, ?, ?)
        ''', (proxy_url, timestamp, status, response_time, error_message, ip_address))
        # Update the aggregate stats
        cursor.execute('''
            UPDATE proxy_stats
            SET total_checks = total_checks + 1,
                last_check = ?
            WHERE proxy_url = ?
        ''', (timestamp, proxy_url))
        if status == 'success':
            cursor.execute('''
                UPDATE proxy_stats
                SET success_count = success_count + 1,
                    last_success = ?,
                    consecutive_failures = 0
                WHERE proxy_url = ?
            ''', (timestamp, proxy_url))
        else:
            cursor.execute('''
                UPDATE proxy_stats
                SET consecutive_failures = consecutive_failures + 1
                WHERE proxy_url = ?
            ''', (proxy_url,))
        # Update the running average response time (successful checks only)
        if status == 'success' and response_time:
            cursor.execute('''
                SELECT avg_response_time, success_count FROM proxy_stats
                WHERE proxy_url = ?
            ''', (proxy_url,))
            result = cursor.fetchone()
            if result:
                old_avg, success_count = result
                if old_avg == 0:
                    new_avg = response_time
                else:
                    new_avg = (old_avg * (success_count - 1) + response_time) / success_count
                cursor.execute('''
                    UPDATE proxy_stats
                    SET avg_response_time = ?
                    WHERE proxy_url = ?
                ''', (new_avg, proxy_url))
        # Recompute the uptime percentage
        cursor.execute('''
            SELECT total_checks, success_count FROM proxy_stats
            WHERE proxy_url = ?
        ''', (proxy_url,))
        result = cursor.fetchone()
        if result:
            total, success = result
            uptime = (success / total * 100) if total > 0 else 0
            cursor.execute('''
                UPDATE proxy_stats
                SET uptime_percentage = ?
                WHERE proxy_url = ?
            ''', (uptime, proxy_url))
        conn.commit()
        conn.close()

    def _monitor_loop(self):
        """Monitoring loop."""
        while self.monitoring:
            print(f"\n[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Checking proxy status...")
            for proxy in self.proxy_list:
                if not self.monitoring:
                    break
                success, response_time, ip = self._check_single_proxy(proxy)
                if success:
                    print(f"✓ {proxy} - {response_time:.2f}s - {ip}")
                else:
                    print(f"✗ {proxy} - check failed")
                time.sleep(1)  # avoid hammering the test endpoint
            if self.monitoring:
                print(f"Sleeping {self.check_interval} seconds until the next pass...")
                time.sleep(self.check_interval)

    def start_monitoring(self):
        """Start monitoring."""
        if self.monitoring:
            print("Monitoring is already running")
            return
        self.monitoring = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop)
        self.monitor_thread.daemon = True
        self.monitor_thread.start()
        print(f"Monitoring {len(self.proxy_list)} proxies, check interval: {self.check_interval}s")

    def stop_monitoring(self):
        """Stop monitoring."""
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join()
        print("Monitoring stopped")

    def get_proxy_stats(self, proxy_url=None):
        """Fetch proxy statistics."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        if proxy_url:
            cursor.execute('''
                SELECT * FROM proxy_stats WHERE proxy_url = ?
            ''', (proxy_url,))
            result = cursor.fetchone()
        else:
            cursor.execute('''
                SELECT * FROM proxy_stats ORDER BY uptime_percentage DESC
            ''')
            result = cursor.fetchall()
        conn.close()
        return result

    def get_recent_logs(self, proxy_url=None, hours=24):
        """Fetch recent log rows."""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        since_time = (datetime.now() - timedelta(hours=hours)).isoformat()
        if proxy_url:
            cursor.execute('''
                SELECT * FROM proxy_logs
                WHERE proxy_url = ? AND timestamp > ?
                ORDER BY timestamp DESC
            ''', (proxy_url, since_time))
        else:
            cursor.execute('''
                SELECT * FROM proxy_logs
                WHERE timestamp > ?
                ORDER BY timestamp DESC
            ''', (since_time,))
        result = cursor.fetchall()
        conn.close()
        return result

    def generate_report(self):
        """Generate a monitoring report."""
        stats = self.get_proxy_stats()
        print("\n" + "="*60)
        print("Proxy Monitoring Report")
        print("="*60)
        if not stats:
            print("No monitoring data yet")
            return
        print(f"{'Proxy':<30} {'Uptime':<10} {'Avg time':<10} {'Consec fails':<12}")
        print("-" * 70)
        for stat in stats:
            proxy_url, total, success, avg_time, last_check, last_success, failures, uptime = stat
            # Truncate long proxy URLs for display
            short_proxy = proxy_url[:27] + "..." if len(proxy_url) > 30 else proxy_url
            print(f"{short_proxy:<30} {uptime:>8.1f}% {avg_time:>8.2f}s {failures:>8d}")
        # Overall statistics
        total_proxies = len(stats)
        healthy_proxies = sum(1 for s in stats if s[7] > 90)  # uptime > 90%
        print(f"\nTotal proxies: {total_proxies}")
        print(f"Healthy proxies: {healthy_proxies} ({healthy_proxies/total_proxies*100:.1f}%)")

# Usage example
monitor = ProxyMonitor()
# Add proxies to the watch list
proxies_to_monitor = [
    'http://user:pass@proxy1.com:8080',
    'http://user:pass@proxy2.com:8080',
    'http://user:pass@proxy3.com:8080',
]
for proxy in proxies_to_monitor:
    monitor.add_proxy(proxy)
# Start monitoring
monitor.start_monitoring()
# Let it run for a while, then view the report
time.sleep(60)  # wait one minute
monitor.generate_report()
# Stop monitoring
# monitor.stop_monitoring()
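The `proxy_stats` table also makes automated pruning straightforward. A minimal sketch, assuming the `monitor` instance above and the column order defined in `_init_database`, that drops any proxy with too many consecutive failures (the threshold is illustrative):

# Drop proxies that have failed several checks in a row
MAX_CONSECUTIVE_FAILURES = 3

for stat in monitor.get_proxy_stats():
    (proxy_url, total, success, avg_time,
     last_check, last_success, failures, uptime) = stat
    if failures > MAX_CONSECUTIVE_FAILURES:
        monitor.remove_proxy(proxy_url)
        print(f"Removed {proxy_url} after {failures} consecutive failures")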
Proxy Performance Benchmarking
Stress-testing tool
import asyncio
import aiohttp
import requests
import time
from concurrent.futures import ThreadPoolExecutor
import statistics

class ProxyBenchmark:
    def __init__(self):
        self.test_urls = [
            'http://httpbin.org/ip',
            'http://httpbin.org/user-agent',
            'http://httpbin.org/headers',
            'https://api.ipify.org?format=json'
        ]

    def stress_test_sync(self, proxy_url, concurrent_requests=10, total_requests=100):
        """Synchronous stress test."""
        print(f"Starting stress test: {proxy_url}")
        print(f"Concurrency: {concurrent_requests}, total requests: {total_requests}")
        results = {
            'proxy': proxy_url,
            'concurrent_requests': concurrent_requests,
            'total_requests': total_requests,
            'success_count': 0,
            'failed_count': 0,
            'response_times': [],
            'errors': [],
            'start_time': time.time(),
            'end_time': None
        }

        def single_request():
            try:
                start = time.time()
                response = requests.get(
                    'http://httpbin.org/ip',
                    proxies={'http': proxy_url, 'https': proxy_url},
                    timeout=30
                )
                end = time.time()
                if response.status_code == 200:
                    return True, end - start, None
                else:
                    return False, end - start, f"HTTP {response.status_code}"
            except Exception as e:
                return False, None, str(e)

        # Run the requests concurrently on a thread pool
        with ThreadPoolExecutor(max_workers=concurrent_requests) as executor:
            futures = [executor.submit(single_request) for _ in range(total_requests)]
            for future in futures:
                success, response_time, error = future.result()
                if success:
                    results['success_count'] += 1
                    if response_time:
                        results['response_times'].append(response_time)
                else:
                    results['failed_count'] += 1
                    if error:
                        results['errors'].append(error)
        results['end_time'] = time.time()
        # Compute summary statistics
        if results['response_times']:
            results['avg_response_time'] = statistics.mean(results['response_times'])
            results['min_response_time'] = min(results['response_times'])
            results['max_response_time'] = max(results['response_times'])
            results['median_response_time'] = statistics.median(results['response_times'])
        results['success_rate'] = (results['success_count'] / total_requests) * 100
        results['total_time'] = results['end_time'] - results['start_time']
        results['requests_per_second'] = total_requests / results['total_time']
        return results

    async def stress_test_async(self, proxy_url, concurrent_requests=10, total_requests=100):
        """Asynchronous stress test (more efficient)."""
        print(f"Starting async stress test: {proxy_url}")
        results = {
            'proxy': proxy_url,
            'concurrent_requests': concurrent_requests,
            'total_requests': total_requests,
            'success_count': 0,
            'failed_count': 0,
            'response_times': [],
            'errors': [],
            'start_time': time.time(),
            'end_time': None
        }
        # Parse credentials out of the proxy URL for aiohttp
        proxy_parts = proxy_url.replace('http://', '').split('@')
        if len(proxy_parts) == 2:
            auth, host_port = proxy_parts
            username, password = auth.split(':')
            host, port = host_port.split(':')
            proxy_auth = aiohttp.BasicAuth(username, password)
            proxy_url_clean = f'http://{host}:{port}'
        else:
            proxy_auth = None
            proxy_url_clean = proxy_url

        async def single_request(session, semaphore):
            async with semaphore:
                try:
                    start = time.time()
                    async with session.get(
                        'http://httpbin.org/ip',
                        proxy=proxy_url_clean,
                        proxy_auth=proxy_auth,
                        timeout=aiohttp.ClientTimeout(total=30)
                    ) as response:
                        end = time.time()
                        await response.text()  # drain the response body
                        if response.status == 200:
                            return True, end - start, None
                        else:
                            return False, end - start, f"HTTP {response.status}"
                except Exception as e:
                    return False, None, str(e)

        # A semaphore caps the number of in-flight requests
        semaphore = asyncio.Semaphore(concurrent_requests)
        async with aiohttp.ClientSession() as session:
            tasks = [single_request(session, semaphore) for _ in range(total_requests)]
            responses = await asyncio.gather(*tasks)
        for success, response_time, error in responses:
            if success:
                results['success_count'] += 1
                if response_time:
                    results['response_times'].append(response_time)
            else:
                results['failed_count'] += 1
                if error:
                    results['errors'].append(error)
        results['end_time'] = time.time()
        # Compute summary statistics
        if results['response_times']:
            results['avg_response_time'] = statistics.mean(results['response_times'])
            results['min_response_time'] = min(results['response_times'])
            results['max_response_time'] = max(results['response_times'])
            results['median_response_time'] = statistics.median(results['response_times'])
        results['success_rate'] = (results['success_count'] / total_requests) * 100
        results['total_time'] = results['end_time'] - results['start_time']
        results['requests_per_second'] = total_requests / results['total_time']
        return results

    def print_benchmark_results(self, results):
        """Print benchmark results."""
        print(f"\n{'='*60}")
        print(f"Stress test results: {results['proxy']}")
        print(f"{'='*60}")
        print(f"Total requests: {results['total_requests']}")
        print(f"Concurrency: {results['concurrent_requests']}")
        print(f"Successful: {results['success_count']}")
        print(f"Failed: {results['failed_count']}")
        print(f"Success rate: {results['success_rate']:.2f}%")
        print(f"Total time: {results['total_time']:.2f}s")
        print(f"Throughput: {results['requests_per_second']:.2f} req/s")
        if results['response_times']:
            print(f"\nResponse time statistics:")
            print(f"  mean: {results['avg_response_time']:.3f}s")
            print(f"  min: {results['min_response_time']:.3f}s")
            print(f"  max: {results['max_response_time']:.3f}s")
            print(f"  median: {results['median_response_time']:.3f}s")
        if results['errors']:
            print(f"\nError breakdown:")
            error_counts = {}
            for error in results['errors']:
                error_counts[error] = error_counts.get(error, 0) + 1
            for error, count in error_counts.items():
                print(f"  {error}: {count}x")

    def compare_proxies(self, proxy_list, concurrent_requests=5, total_requests=50):
        """Compare the performance of several proxies."""
        print(f"Comparing the performance of {len(proxy_list)} proxies...")
        all_results = []
        for i, proxy in enumerate(proxy_list, 1):
            print(f"\n[{i}/{len(proxy_list)}] Testing proxy: {proxy}")
            # Use the async test for efficiency
            result = asyncio.run(self.stress_test_async(
                proxy, concurrent_requests, total_requests
            ))
            all_results.append(result)
            self.print_benchmark_results(result)
        # Generate the comparison report
        self._generate_comparison_report(all_results)
        return all_results

    def _generate_comparison_report(self, results):
        """Generate a comparison report."""
        print(f"\n{'='*80}")
        print("Proxy Performance Comparison")
        print(f"{'='*80}")
        # Sort by success rate
        results.sort(key=lambda x: x['success_rate'], reverse=True)
        print(f"{'Rank':<4} {'Proxy':<35} {'Success':<8} {'Avg time':<12} {'Throughput':<12}")
        print("-" * 80)
        for i, result in enumerate(results, 1):
            proxy_short = result['proxy'][:32] + "..." if len(result['proxy']) > 35 else result['proxy']
            success_rate = f"{result['success_rate']:.1f}%"
            avg_time = f"{result.get('avg_response_time', 0):.3f}s" if result.get('avg_response_time') else "N/A"
            rps = f"{result['requests_per_second']:.1f}/s"
            print(f"{i:<4} {proxy_short:<35} {success_rate:<8} {avg_time:<12} {rps:<12}")
        # Recommend the best proxy
        if results:
            best_proxy = results[0]
            print(f"\n🏆 Recommended proxy: {best_proxy['proxy']}")
            print(f"   Success rate: {best_proxy['success_rate']:.1f}%")
            print(f"   Avg response time: {best_proxy.get('avg_response_time', 0):.3f}s")

# Usage example
benchmark = ProxyBenchmark()
test_proxies = [
    'http://user:pass@proxy1.com:8080',
    'http://user:pass@proxy2.com:8080',
    'http://user:pass@proxy3.com:8080',
]
# Comparison run
comparison_results = benchmark.compare_proxies(
    test_proxies,
    concurrent_requests=3,
    total_requests=30
)
# Standalone stress test
# single_result = asyncio.run(benchmark.stress_test_async(
#     'http://user:pass@proxy1.com:8080',
#     concurrent_requests=10,
#     total_requests=100
# ))
# benchmark.print_benchmark_results(single_result)
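Min and max values are noisy under load; percentiles are usually more comparable across runs. A small sketch that computes them from a result dict shaped like the ones returned above (assumes Python 3.8+ for statistics.quantiles):

import statistics

def response_time_percentiles(results, percentiles=(50, 90, 95, 99)):
    """Compute response-time percentiles from a stress-test result dict."""
    times = sorted(results.get('response_times', []))
    if len(times) < 2:
        return {}  # quantiles needs at least two data points
    # n=100 yields the 1st..99th percentile cut points
    cuts = statistics.quantiles(times, n=100)
    return {p: cuts[p - 1] for p in percentiles}

# Example: response_time_percentiles(comparison_results[0])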
Proxy Health Scoring System
Composite scoring algorithm
class ProxyHealthScorer:
    def __init__(self):
        self.weights = {
            'availability': 0.3,   # availability weight
            'speed': 0.25,         # speed weight
            'stability': 0.25,     # stability weight
            'anonymity': 0.15,     # anonymity weight
            'geographic': 0.05     # location weight
        }

    def calculate_health_score(self, proxy_data):
        """Compute a proxy health score."""
        scores = {}
        # Availability (0-100)
        scores['availability'] = self._score_availability(proxy_data)
        # Speed (0-100)
        scores['speed'] = self._score_speed(proxy_data)
        # Stability (0-100)
        scores['stability'] = self._score_stability(proxy_data)
        # Anonymity (0-100)
        scores['anonymity'] = self._score_anonymity(proxy_data)
        # Location (0-100)
        scores['geographic'] = self._score_geographic(proxy_data)
        # Weighted total
        total_score = sum(
            scores[metric] * self.weights[metric]
            for metric in scores
        )
        return {
            'total_score': round(total_score, 2),
            'detailed_scores': scores,
            'grade': self._get_grade(total_score),
            'recommendations': self._get_recommendations(scores)
        }

    def _score_availability(self, data):
        """Availability score."""
        # Based on the recent success rate
        recent_success_rate = data.get('recent_success_rate', 0)
        # Penalty for consecutive failures
        consecutive_failures = data.get('consecutive_failures', 0)
        failure_penalty = min(consecutive_failures * 10, 50)
        # Based on overall uptime
        uptime = data.get('uptime_percentage', 0)
        base_score = (recent_success_rate + uptime) / 2
        final_score = max(0, base_score - failure_penalty)
        return min(100, final_score)

    def _score_speed(self, data):
        """Speed score."""
        avg_response_time = data.get('avg_response_time', 999)
        if avg_response_time == 0 or avg_response_time > 30:
            return 0  # no data, or hopelessly slow
        # Response-time scoring curve
        if avg_response_time <= 1:
            return 100
        elif avg_response_time <= 2:
            return 90
        elif avg_response_time <= 3:
            return 80
        elif avg_response_time <= 5:
            return 70
        elif avg_response_time <= 8:
            return 60
        elif avg_response_time <= 12:
            return 50
        elif avg_response_time <= 20:
            return 30
        else:
            return 10

    def _score_stability(self, data):
        """Stability score."""
        # Variability of response times
        response_time_std = data.get('response_time_std', 0)
        stability_score = max(0, 100 - response_time_std * 20)
        # Penalty for outage events
        downtime_events = data.get('downtime_events_24h', 0)
        downtime_penalty = min(downtime_events * 15, 60)
        # Long-term track record
        long_term_uptime = data.get('uptime_7d', 0)
        final_score = (stability_score + long_term_uptime) / 2 - downtime_penalty
        return max(0, min(100, final_score))

    def _score_anonymity(self, data):
        """Anonymity score."""
        anonymity_level = data.get('anonymity_level', 'unknown')
        anonymity_scores = {
            'elite': 100,
            'anonymous': 75,
            'transparent': 25,
            'unknown': 0
        }
        base_score = anonymity_scores.get(anonymity_level, 0)
        # Penalty for a history of IP leaks
        ip_leak_incidents = data.get('ip_leak_incidents', 0)
        leak_penalty = min(ip_leak_incidents * 20, 50)
        return max(0, base_score - leak_penalty)

    def _score_geographic(self, data):
        """Location score."""
        # Match against the target region
        target_country = data.get('target_country')
        proxy_country = data.get('proxy_country')
        if target_country and proxy_country:
            if target_country.lower() == proxy_country.lower():
                return 100
            else:
                return 50
        # Fall back to the IP type
        is_datacenter = data.get('is_datacenter', True)
        if is_datacenter:
            return 60  # datacenter IPs score lower
        else:
            return 90  # residential IPs score higher

    def _get_grade(self, score):
        """Map a score to a letter grade."""
        if score >= 90:
            return 'A+'
        elif score >= 80:
            return 'A'
        elif score >= 70:
            return 'B+'
        elif score >= 60:
            return 'B'
        elif score >= 50:
            return 'C'
        else:
            return 'D'

    def _get_recommendations(self, scores):
        """Suggest improvements based on the sub-scores."""
        recommendations = []
        if scores['availability'] < 70:
            recommendations.append("Low availability; consider switching providers or checking your network configuration")
        if scores['speed'] < 60:
            recommendations.append("Slow responses; consider proxies geographically closer to the target")
        if scores['stability'] < 70:
            recommendations.append("Poor stability; add proxy rotation or keep backup proxies on hand")
        if scores['anonymity'] < 80:
            recommendations.append("Insufficient anonymity; use elite proxies or residential IPs")
        if scores['geographic'] < 80:
            recommendations.append("Location mismatch; choose proxies in the target region")
        return recommendations

    def batch_score_proxies(self, proxy_data_list):
        """Score proxies in bulk."""
        results = []
        for proxy_data in proxy_data_list:
            score_result = self.calculate_health_score(proxy_data)
            score_result['proxy_url'] = proxy_data.get('proxy_url', 'Unknown')
            results.append(score_result)
        # Sort by total score
        results.sort(key=lambda x: x['total_score'], reverse=True)
        return results

    def generate_health_report(self, scored_proxies):
        """Generate a health report."""
        print(f"\n{'='*80}")
        print("Proxy Health Assessment Report")
        print(f"{'='*80}")
        if not scored_proxies:
            print("No proxy data yet")
            return
        # Overall statistics
        total_proxies = len(scored_proxies)
        avg_score = sum(p['total_score'] for p in scored_proxies) / total_proxies
        grade_counts = {}
        for proxy in scored_proxies:
            grade = proxy['grade']
            grade_counts[grade] = grade_counts.get(grade, 0) + 1
        print(f"Total proxies: {total_proxies}")
        print(f"Average score: {avg_score:.2f}")
        print(f"Grade distribution: {dict(sorted(grade_counts.items()))}")
        # Detailed list
        print(f"\n{'Rank':<4} {'Grade':<5} {'Score':<6} {'Proxy':<40} {'Main issue'}")
        print("-" * 80)
        for i, proxy in enumerate(scored_proxies[:10], 1):  # show the top 10
            proxy_short = proxy['proxy_url'][:37] + "..." if len(proxy['proxy_url']) > 40 else proxy['proxy_url']
            # The lowest sub-score is reported as the main issue
            detailed = proxy['detailed_scores']
            min_metric = min(detailed.keys(), key=lambda k: detailed[k])
            main_issue = f"{min_metric}({detailed[min_metric]:.0f})"
            print(f"{i:<4} {proxy['grade']:<5} {proxy['total_score']:<6.1f} {proxy_short:<40} {main_issue}")
        # Highlights and warnings
        excellent_proxies = [p for p in scored_proxies if p['total_score'] >= 90]
        poor_proxies = [p for p in scored_proxies if p['total_score'] < 50]
        if excellent_proxies:
            print(f"\n🏆 Excellent proxies ({len(excellent_proxies)}):")
            for proxy in excellent_proxies[:3]:
                print(f"   {proxy['proxy_url']} (score: {proxy['total_score']})")
        if poor_proxies:
            print(f"\n⚠️ Proxies needing attention ({len(poor_proxies)}):")
            for proxy in poor_proxies[:3]:
                print(f"   {proxy['proxy_url']} (score: {proxy['total_score']})")
                if proxy['recommendations']:
                    print(f"   Suggestion: {proxy['recommendations'][0]}")

# Usage example
scorer = ProxyHealthScorer()
# Simulated proxy data
sample_proxy_data = [
    {
        'proxy_url': 'http://user:pass@proxy1.com:8080',
        'recent_success_rate': 95,
        'consecutive_failures': 0,
        'uptime_percentage': 98,
        'avg_response_time': 1.2,
        'response_time_std': 0.3,
        'downtime_events_24h': 1,
        'uptime_7d': 96,
        'anonymity_level': 'elite',
        'ip_leak_incidents': 0,
        'is_datacenter': False,
        'proxy_country': 'US',
        'target_country': 'US'
    },
    {
        'proxy_url': 'http://user:pass@proxy2.com:8080',
        'recent_success_rate': 75,
        'consecutive_failures': 2,
        'uptime_percentage': 85,
        'avg_response_time': 3.5,
        'response_time_std': 1.2,
        'downtime_events_24h': 3,
        'uptime_7d': 80,
        'anonymity_level': 'anonymous',
        'ip_leak_incidents': 1,
        'is_datacenter': True,
        'proxy_country': 'DE',
        'target_country': 'US'
    }
]
# Batch scoring
scored_results = scorer.batch_score_proxies(sample_proxy_data)
# Generate the report
scorer.generate_health_report(scored_results)
# Inspect an individual proxy's detailed scores
for result in scored_results:
    print(f"\nProxy: {result['proxy_url']}")
    print(f"Total score: {result['total_score']} ({result['grade']})")
    print("Detailed scores:")
    for metric, score in result['detailed_scores'].items():
        print(f"  {metric}: {score:.1f}")
    if result['recommendations']:
        print("Recommendations:")
        for rec in result['recommendations']:
            print(f"  - {rec}")
Practical Tips
1. Recommended check frequencies
# Adjust check frequency to the usage scenario
detection_schedules = {
    'production': {
        'basic_check': 'every 5 minutes',
        'deep_analysis': 'every hour',
        'benchmark': 'daily',
        'health_score': 'every 6 hours'
    },
    'development': {
        'basic_check': 'every 15 minutes',
        'deep_analysis': 'every 4 hours',
        'benchmark': 'weekly',
        'health_score': 'daily'
    },
    'testing': {
        'basic_check': 'before each use',
        'deep_analysis': 'before each use',
        'benchmark': 'before purchase',
        'health_score': 'before purchase'
    }
}
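To actually run checks on a schedule like this, a plain loop (or a cron entry) is enough. A minimal sketch using the `BasicProxyChecker` from earlier, with the production "every 5 minutes" interval spelled out in seconds:

import time

CHECK_INTERVAL_SECONDS = 5 * 60  # 'basic_check': every 5 minutes

def run_scheduled_checks(checker, proxies, interval=CHECK_INTERVAL_SECONDS):
    """Run the basic availability check at a fixed interval, forever."""
    while True:
        checker.quick_test(proxies)
        time.sleep(interval)

# run_scheduled_checks(BasicProxyChecker(), test_proxies)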
2. Automated proxy management
class AutoProxyManager:
    def __init__(self):
        self.active_proxies = []
        self.backup_proxies = []
        self.blacklisted_proxies = []

    def auto_rotate_bad_proxies(self):
        """Automatically rotate out underperforming proxies."""
        for proxy in self.active_proxies[:]:
            health_score = self.get_proxy_health_score(proxy)
            if health_score < 50:
                print(f"Proxy {proxy} is underperforming, moving to the backup list")
                self.active_proxies.remove(proxy)
                # Promote a replacement from the backup list (if any) before
                # re-queuing the demoted proxy, so it is not promoted right back
                if self.backup_proxies:
                    promoted = self.backup_proxies.pop(0)
                    self.active_proxies.append(promoted)
                    print(f"Promoted proxy {promoted} to the active list")
                self.backup_proxies.append(proxy)

    def get_proxy_health_score(self, proxy):
        # Wire this to the ProxyHealthScorer above; the placeholder value
        # keeps the example runnable until then.
        return 100
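A sketch of how the manager might be driven, with the lists seeded by hand purely for illustration:

manager = AutoProxyManager()
manager.active_proxies = ['http://user:pass@proxy1.com:8080']
manager.backup_proxies = ['http://user:pass@proxy2.com:8080']
# Call periodically, e.g. after every monitoring pass
manager.auto_rotate_bad_proxies()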
3. Cost-effectiveness analysis
def analyze_proxy_cost_effectiveness(proxy_results, monthly_cost):
    """Analyze how cost-effective each proxy is."""
    for result in proxy_results:
        success_rate = result.get('success_rate', 0)
        avg_response_time = result.get('avg_response_time', 999)
        # Scale the monthly cost by the inverse success rate: paying for
        # failed requests effectively raises the price of the working ones
        if success_rate > 0:
            effective_cost = monthly_cost / (success_rate / 100)
            efficiency_score = (success_rate / 100) / (avg_response_time + 1)
            print(f"Proxy: {result['proxy']}")
            print(f"  Success rate: {success_rate:.1f}%")
            print(f"  Failure-adjusted monthly cost: ${effective_cost:.2f}")
            print(f"  Efficiency score: {efficiency_score:.3f}")
Summary
Proxy quality testing is a systematic effort that has to evaluate several dimensions at once:
Core testing metrics:
- Availability: the most basic metric; decides whether the proxy works at all
- Speed: the key factor in scraping efficiency
- Stability: decides how reliable the service is
- Anonymity: affects how well you evade anti-scraping measures
- Location: affects how the target site responds to your requests
Choosing a testing tool:
- Basic checks: good for quick day-to-day validation
- Deep analysis: good for evaluation before purchase
- Stress tests: good for validating high-concurrency scenarios
- Continuous monitoring: good for production use
Practical advice:
- Build a layered testing pipeline: basic checks → deep analysis → continuous monitoring
- Set sensible scoring standards: tune the weights to your business needs
- Automate management: reduce manual intervention and improve efficiency
- Control costs: balance quality against price and pick the best-value option
Remember: there is no perfect proxy, only the proxy that fits your use case best. Choose testing methods based on your actual needs, and evaluate and tune your proxy pool regularly; that is what keeps a scraping project running reliably.