Introduction
Anyone who writes crawlers has run into this: the program is humming along, and suddenly every request returns a 403 error or simply times out. Nine times out of ten, the site has banned your IP.
Anti-crawling defenses keep getting stricter, and even a slightly aggressive request rate can get you blacklisted. Below are several practical countermeasures, all of which I have used in real projects.
Solution 1: Proxy IP Pool
This is the most direct fix: put on a different disguise and keep working.
Basic implementation
import requests
import random
import time

class ProxyPool:
    def __init__(self):
        # Put your proxy list here
        self.proxies = [
            'http://user:pass@proxy1.com:8080',
            'http://user:pass@proxy2.com:8080',
            'http://user:pass@proxy3.com:8080',
        ]
        self.failed_proxies = set()

    def get_proxy(self):
        available = [p for p in self.proxies if p not in self.failed_proxies]
        if not available:
            return None
        # Use the same proxy for both schemes so failures can be tracked reliably
        proxy = random.choice(available)
        return {'http': proxy, 'https': proxy}

    def mark_failed(self, proxy_url):
        self.failed_proxies.add(proxy_url)

def crawl_with_proxy(url):
    pool = ProxyPool()
    for attempt in range(3):  # retry at most 3 times
        proxy = pool.get_proxy()
        if not proxy:
            break
        try:
            response = requests.get(url, proxies=proxy, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pool.mark_failed(proxy['http'])
        time.sleep(1)
    return None
Pros and cons:
- Pros: works immediately; the fastest way out of an IP ban
- Cons: good proxies cost money, and free ones are unreliable (a health-check sketch follows below)
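To soften the reliability problem, you can health-check free or cheap proxies before handing them to ProxyPool. Here is a minimal sketch under stated assumptions: the proxy addresses are placeholders, and httpbin.org/ip is used only as an example test endpoint.

import requests

def check_proxy(proxy_url, test_url='https://httpbin.org/ip', timeout=5):
    """Return True if the proxy can complete a simple GET within the timeout."""
    proxies = {'http': proxy_url, 'https': proxy_url}
    try:
        return requests.get(test_url, proxies=proxies, timeout=timeout).status_code == 200
    except requests.RequestException:
        return False

# Keep only the proxies that pass the check (addresses below are placeholders)
candidates = [
    'http://user:pass@proxy1.com:8080',
    'http://user:pass@proxy2.com:8080',
]
working = [p for p in candidates if check_proxy(p)]
print(f"{len(working)}/{len(candidates)} proxies are usable")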
Solution 2: Throttle the Request Rate
Don't be impatient; take it slow. The main reason a site bans you is that you hit it too often.
Adaptive delays
import time
import random
import requests
from datetime import datetime

class SmartDelay:
    def __init__(self):
        self.fail_count = 0
        self.success_count = 0
        self.last_request_time = None

    def wait(self):
        # Base delay of 1-3 seconds
        base_delay = random.uniform(1, 3)
        # Back off further when the failure rate is high
        if self.fail_count > 0:
            fail_rate = self.fail_count / (self.fail_count + self.success_count)
            if fail_rate > 0.3:  # failure rate above 30%
                base_delay *= 2
        print(f"Waiting {base_delay:.1f} seconds...")
        time.sleep(base_delay)
        self.last_request_time = datetime.now()

    def record_result(self, success):
        if success:
            self.success_count += 1
            # After a success we can get slightly more aggressive again
            if self.fail_count > 0:
                self.fail_count -= 1
        else:
            self.fail_count += 1

# Usage example
def crawl_slowly(urls):
    delay = SmartDelay()
    for i, url in enumerate(urls):
        if i > 0:  # no need to wait before the first request
            delay.wait()
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                delay.record_result(True)
                print(f"✓ {url}")
            else:
                delay.record_result(False)
                print(f"✗ {url} - {response.status_code}")
        except Exception as e:
            delay.record_result(False)
            print(f"✗ {url} - {e}")
Solution 3: Rotate the User-Agent
A fixed User-Agent is like wearing a sign that says "I am a crawler".
import random
import requests

class UARotator:
    def __init__(self):
        self.user_agents = [
            # Windows Chrome
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            # Mac Chrome
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            # Windows Firefox
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
            # Mac Safari
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
            # Mobile Safari (iPhone)
            'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
        ]

    def get_headers(self):
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

# Usage
ua = UARotator()
for url in urls:  # urls: your list of target URLs
    headers = ua.get_headers()
    response = requests.get(url, headers=headers)
Solution 4: Session Reuse
Don't set up a new connection for every request; use a Session to keep connections and state alive.
import time
import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class SmartSession:
    def __init__(self):
        self.session = requests.Session()
        self.setup_session()

    def setup_session(self):
        # Configure the retry policy
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=1
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        # Default headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })

    def get(self, url, **kwargs):
        return self.session.get(url, timeout=15, **kwargs)

    def close(self):
        self.session.close()

# Usage
session = SmartSession()
try:
    for url in urls:  # urls: your list of target URLs
        response = session.get(url)
        print(f"Status code: {response.status_code}")
        time.sleep(random.uniform(1, 3))
finally:
    session.close()
Solution 5: Distributed Crawling
If one machine can't keep up, use a cluster and spread the load across several workers.
Redis task queue
import json
import random
import socket
import time
from datetime import datetime

import redis
import requests

class DistributedCrawler:
    def __init__(self):
        self.redis_client = redis.Redis(host='localhost', port=6379, db=0)
        self.worker_id = socket.gethostname() + '-' + str(datetime.now().timestamp())[:10]
        self.task_queue = 'crawler:tasks'
        self.result_queue = 'crawler:results'

    def add_tasks(self, urls):
        """Master node: enqueue tasks."""
        for url in urls:
            task = {'url': url, 'retries': 0}
            self.redis_client.lpush(self.task_queue, json.dumps(task))
        print(f"Added {len(urls)} tasks")

    def get_task(self):
        """Worker node: fetch a task."""
        task_data = self.redis_client.brpop(self.task_queue, timeout=10)
        return json.loads(task_data[1]) if task_data else None

    def save_result(self, result):
        """Store a result."""
        result['worker'] = self.worker_id
        result['time'] = datetime.now().isoformat()
        self.redis_client.lpush(self.result_queue, json.dumps(result))

    def start_worker(self):
        """Run the worker loop."""
        print(f"Worker {self.worker_id} starting...")
        while True:
            task = self.get_task()
            if not task:
                continue
            url = task['url']
            print(f"Processing: {url}")
            try:
                response = requests.get(url, timeout=10)
                result = {
                    'url': url,
                    'status': 'success',
                    'status_code': response.status_code,
                    'size': len(response.content)
                }
                self.save_result(result)
                print(f"✓ Done: {url}")
            except Exception as e:
                # Retry logic
                if task['retries'] < 3:
                    task['retries'] += 1
                    self.redis_client.lpush(self.task_queue, json.dumps(task))
                    print(f"↻ Retrying: {url}")
                else:
                    result = {'url': url, 'status': 'failed', 'error': str(e)}
                    self.save_result(result)
                    print(f"✗ Failed: {url}")
            time.sleep(random.uniform(1, 3))

# How to use:
# 1. On the master node:
#    crawler = DistributedCrawler()
#    crawler.add_tasks(['http://example.com', ...])
# 2. On each worker node:
#    crawler = DistributedCrawler()
#    crawler.start_worker()
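To get the scraped data back, the master node can drain the results queue. The helper below is a small sketch rather than part of the class above; it assumes the same Redis connection settings.

import json

def collect_results(crawler, limit=100):
    """Pop up to `limit` results from crawler:results on the master node."""
    results = []
    for _ in range(limit):
        data = crawler.redis_client.rpop(crawler.result_queue)
        if data is None:  # queue drained
            break
        results.append(json.loads(data))
    return results

# Example: crawler = DistributedCrawler(); print(collect_results(crawler))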
Advanced Tips
Handling JavaScript-rendered pages
Some sites load their content dynamically with JavaScript, so plain requests never sees it.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_js_content(url):
    options = Options()
    options.add_argument('--headless')  # headless mode
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Implicit wait for element lookups; driver.get itself blocks until the page has loaded
        driver.implicitly_wait(10)
        return driver.page_source
    finally:
        driver.quit()

# Or use the lighter-weight pyppeteer
import asyncio
from pyppeteer import launch

async def get_js_content_async(url):
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.goto(url)
    content = await page.content()
    await browser.close()
    return content

# Usage
# content = asyncio.run(get_js_content_async('https://example.com'))
A simple retry decorator
import time
import requests

def retry(times=3, delay=1):
    def decorator(func):
        def wrapper(*args, **kwargs):
            for i in range(times):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if i == times - 1:  # last attempt, give up
                        raise
                    print(f"Attempt {i + 1} failed: {e}")
                    time.sleep(delay * (i + 1))  # increasing back-off
            return None
        return wrapper
    return decorator

# Usage
@retry(times=3, delay=2)
def fetch_url(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response
Practical Advice
1. Check robots.txt
Be a well-mannered crawler:
import urllib.robotparser
from urllib.parse import urljoin, urlparse

def can_crawl(url, user_agent='*'):
    try:
        base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
        robots_url = urljoin(base_url, '/robots.txt')
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except Exception:
        return True  # if robots.txt can't be read, default to allowing
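Usage is a one-liner; the URL and user-agent name below are just placeholders.

import requests

url = 'https://example.com/some/page'  # placeholder URL
if can_crawl(url, user_agent='MyCrawler'):
    response = requests.get(url, timeout=10)
else:
    print(f"robots.txt disallows crawling {url}")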
2. Monitor your crawler
A simple stats tracker:
import time
import requests

class CrawlerStats:
    def __init__(self):
        self.total = 0
        self.success = 0
        self.failed = 0
        self.start_time = time.time()

    def record(self, success):
        self.total += 1
        if success:
            self.success += 1
        else:
            self.failed += 1

    def print_stats(self):
        runtime = time.time() - self.start_time
        success_rate = (self.success / self.total * 100) if self.total > 0 else 0
        speed = self.total / runtime * 60 if runtime > 0 else 0
        print("\nStats:")
        print(f"Runtime: {runtime:.1f}s")
        print(f"Total requests: {self.total}")
        print(f"Succeeded: {self.success} ({success_rate:.1f}%)")
        print(f"Failed: {self.failed}")
        print(f"Speed: {speed:.1f} requests/minute")

# Usage
stats = CrawlerStats()
for url in urls:  # urls: your list of target URLs
    try:
        response = requests.get(url)
        stats.record(response.status_code == 200)
    except requests.RequestException:
        stats.record(False)
    if stats.total % 10 == 0:  # print stats every 10 requests
        stats.print_stats()
Summary
Each of these five approaches has its place:
- Proxy IPs: the fastest fix, good for emergencies
- Rate limiting: the most basic technique, and one you should always use
- UA rotation: the cheapest to add, with decent results
- Session reuse: improves efficiency and reduces resource consumption
- Distributed crawling: the go-to for large-scale jobs
In real projects, combine them. For example: basic rate limiting + UA rotation + a shared Session, and add a proxy pool only once you actually start getting banned. A rough sketch of that combination follows.
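The sketch below shows one way the pieces might fit together. It reuses the SmartDelay, UARotator, and SmartSession classes defined earlier, so treat it as an illustration rather than a drop-in implementation.

import requests

def crawl(urls):
    delay = SmartDelay()
    ua = UARotator()
    session = SmartSession()
    results = {}
    try:
        for i, url in enumerate(urls):
            if i > 0:
                delay.wait()  # rate limiting between requests
            headers = ua.get_headers()  # rotate the User-Agent per request
            try:
                resp = session.get(url, headers=headers)  # pooled connection + retries
                delay.record_result(resp.status_code == 200)
                results[url] = resp.status_code
            except requests.RequestException as e:
                delay.record_result(False)
                results[url] = str(e)
    finally:
        session.close()
    return results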
One last reminder: these techniques are for solving problems, not for causing damage. Crawl in moderation and don't put heavy load on someone else's servers; everyone has to make a living, so show a little consideration.
Remember the rule of thumb: if there's a public API, use it instead of scraping; if you can crawl slowly, don't crawl hard; and if you can crawl less, don't crawl more.