5 Solutions for a Banned IP When Crawling with Python

Introduction

If you write crawlers, you've seen this: the program runs fine for a while, then suddenly every request returns a 403 or simply times out. Nine times out of ten, the site has banned your IP.

Anti-crawling measures keep getting stricter; send requests a little too fast and you get blacklisted. Here are a few practical solutions, all of which I've used in real projects.

Solution 1: Proxy IP Pool

This is the most direct fix: put on a different disguise and keep working.

Basic Implementation

import requests
import random
import time

class ProxyPool:
    def __init__(self):
        # Your proxy list goes here
        self.proxies = [
            'http://user:pass@proxy1.com:8080',
            'http://user:pass@proxy2.com:8080',
            'http://user:pass@proxy3.com:8080',
        ]
        self.failed_proxies = set()
    
    def get_proxy(self):
        available = [p for p in self.proxies if p not in self.failed_proxies]
        if not available:
            return None
        # Use the same proxy for both schemes so failures can be tracked
        proxy = random.choice(available)
        return {'http': proxy, 'https': proxy}
    
    def mark_failed(self, proxy_url):
        self.failed_proxies.add(proxy_url)

def crawl_with_proxy(url):
    pool = ProxyPool()
    
    for attempt in range(3):  # retry up to 3 times
        proxy = pool.get_proxy()
        if not proxy:
            break
            
        try:
            response = requests.get(url, proxies=proxy, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pool.mark_failed(proxy['http'])
            time.sleep(1)
    
    return None

Pros and cons:

  • Pros: takes effect immediately; the fastest way around an IP ban
  • Cons: good proxies cost money, and free ones are unreliable (a quick health-check sketch follows)
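
Because free proxies die so often, it's worth validating each one before it goes into the pool. A minimal sketch, assuming an echo endpoint such as https://httpbin.org/ip as the test URL (swap in any URL you prefer):

import requests

def check_proxy(proxy_url, test_url='https://httpbin.org/ip', timeout=5):
    """Return True if the proxy can complete a simple request in time."""
    proxies = {'http': proxy_url, 'https': proxy_url}
    try:
        resp = requests.get(test_url, proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

# Filter candidates down to working proxies before building the pool
# working = [p for p in candidate_proxies if check_proxy(p)]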

Solution 2: Throttle the Request Rate

Don't be greedy; slow down. Sites ban you mainly because you request too frequently.

Smart Delays

import requests
import time
import random
from datetime import datetime

class SmartDelay:
    def __init__(self):
        self.fail_count = 0
        self.success_count = 0
        self.last_request_time = None
    
    def wait(self):
        # Base delay of 1-3 seconds
        base_delay = random.uniform(1, 3)
        
        # If the failure rate is high, wait longer
        if self.fail_count > 0:
            fail_rate = self.fail_count / (self.fail_count + self.success_count)
            if fail_rate > 0.3:  # failure rate above 30%
                base_delay *= 2
        
        print(f"Waiting {base_delay:.1f} seconds...")
        time.sleep(base_delay)
        self.last_request_time = datetime.now()
    
    def record_result(self, success):
        if success:
            self.success_count += 1
            # After a success we can get a little more aggressive again
            if self.fail_count > 0:
                self.fail_count -= 1
        else:
            self.fail_count += 1

# Usage example
def crawl_slowly(urls):
    delay = SmartDelay()
    
    for i, url in enumerate(urls):
        if i > 0:  # no need to delay before the first request
            delay.wait()
        
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                delay.record_result(True)
                print(f"✓ {url}")
            else:
                delay.record_result(False)
                print(f"✗ {url} - {response.status_code}")
        except Exception as e:
            delay.record_result(False)
            print(f"✗ {url} - {e}")

Solution 3: Rotate the User-Agent

A fixed User-Agent is like wearing a sign that says "I'm a crawler".

import random

class UARotator:
    def __init__(self):
        self.user_agents = [
            # Windows Chrome
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            # Mac Chrome  
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            # Windows Firefox
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
            # Mac Safari
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
            # Mobile browser
            'Mozilla/5.0 (iPhone; CPU iPhone OS 17_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
        ]
    
    def get_headers(self):
        return {
            'User-Agent': random.choice(self.user_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

# Usage
ua = UARotator()
for url in urls:
    headers = ua.get_headers()
    response = requests.get(url, headers=headers)

Solution 4: Session Reuse

Don't open a brand-new connection for every request; use a Session to keep state alive.

import time
import random
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

class SmartSession:
    def __init__(self):
        self.session = requests.Session()
        self.setup_session()
    
    def setup_session(self):
        # Configure a retry strategy
        retry_strategy = Retry(
            total=3,
            status_forcelist=[429, 500, 502, 503, 504],
            backoff_factor=1
        )
        
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        
        # Default headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })
    
    def get(self, url, **kwargs):
        return self.session.get(url, timeout=15, **kwargs)
    
    def close(self):
        self.session.close()

# Usage
session = SmartSession()
try:
    for url in urls:
        response = session.get(url)
        print(f"状态码: {response.status_code}")
        time.sleep(random.uniform(1, 3))
finally:
    session.close()

Solution 5: Distributed Crawling

If one machine can't handle it, use a cluster and spread the load across several machines.

Redis Task Queue

import redis
import json
import socket
import time
import random
import requests
from datetime import datetime

class DistributedCrawler:
    def __init__(self):
        self.redis_client = redis.Redis(host='localhost', port=6379, db=0)
        self.worker_id = socket.gethostname() + str(datetime.now().timestamp())[:10]
        self.task_queue = 'crawler:tasks'
        self.result_queue = 'crawler:results'
    
    def add_tasks(self, urls):
        """Master node: enqueue tasks"""
        for url in urls:
            task = {'url': url, 'retries': 0}
            self.redis_client.lpush(self.task_queue, json.dumps(task))
        print(f"Added {len(urls)} tasks")
    
    def get_task(self):
        """工作节点获取任务"""
        task_data = self.redis_client.brpop(self.task_queue, timeout=10)
        return json.loads(task_data[1]) if task_data else None
    
    def save_result(self, result):
        """保存结果"""
        result['worker'] = self.worker_id
        result['time'] = datetime.now().isoformat()
        self.redis_client.lpush(self.result_queue, json.dumps(result))
    
    def start_worker(self):
        """Start a worker process"""
        print(f"Worker {self.worker_id} starting...")
        
        while True:
            task = self.get_task()
            if not task:
                continue
            
            url = task['url']
            print(f"处理: {url}")
            
            try:
                response = requests.get(url, timeout=10)
                result = {
                    'url': url,
                    'status': 'success',
                    'status_code': response.status_code,
                    'size': len(response.content)
                }
                self.save_result(result)
                print(f"✓ 完成: {url}")
                
            except Exception as e:
                # Retry logic
                if task['retries'] < 3:
                    task['retries'] += 1
                    self.redis_client.lpush(self.task_queue, json.dumps(task))
                    print(f"↻ Retrying: {url}")
                else:
                    result = {'url': url, 'status': 'failed', 'error': str(e)}
                    self.save_result(result)
                    print(f"✗ Failed: {url}")
            
            time.sleep(random.uniform(1, 3))

# How to use:
# 1. On the master node:
# crawler = DistributedCrawler()
# crawler.add_tasks(['http://example.com', ...])

# 2. On each worker node:
# crawler = DistributedCrawler()
# crawler.start_worker()

Advanced Tips

Handling JavaScript Rendering

Some sites load their content dynamically with JavaScript, so plain requests never sees it.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def get_js_content(url):
    options = Options()
    options.add_argument('--headless')  # headless mode
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # Note: implicitly_wait only affects element lookups; for content that
        # loads in late, prefer an explicit WebDriverWait condition
        driver.implicitly_wait(10)
        return driver.page_source
    finally:
        driver.quit()

# Alternatively, use the lighter-weight pyppeteer
import asyncio
from pyppeteer import launch

async def get_js_content_async(url):
    browser = await launch(headless=True)
    page = await browser.newPage()
    await page.goto(url)
    content = await page.content()
    await browser.close()
    return content

# Usage
# content = asyncio.run(get_js_content_async('https://example.com'))

A Simple Retry Decorator

import time
import requests

def retry(times=3, delay=1):
    def decorator(func):
        def wrapper(*args, **kwargs):
            for i in range(times):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if i == times - 1:  # last attempt
                        raise
                    print(f"Attempt {i + 1} failed: {e}")
                    time.sleep(delay * (i + 1))  # back off a little more each time
            return None
        return wrapper
    return decorator

# Usage
@retry(times=3, delay=2)
def fetch_url(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response

Practical Advice

1. Check robots.txt

Be a well-behaved crawler:

import urllib.robotparser
from urllib.parse import urljoin, urlparse

def can_crawl(url, user_agent='*'):
    try:
        base_url = f"{urlparse(url).scheme}://{urlparse(url).netloc}"
        robots_url = urljoin(base_url, '/robots.txt')
        
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        
        return rp.can_fetch(user_agent, url)
    except Exception:
        return True  # if robots.txt can't be read, default to allowed
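
# Usage (example.com is just a placeholder URL)
if can_crawl('https://example.com/page'):
    print("robots.txt allows this URL")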

2. Monitor Your Crawler's Status

A simple stats helper:

import time
import requests

class CrawlerStats:
    def __init__(self):
        self.total = 0
        self.success = 0
        self.failed = 0
        self.start_time = time.time()
    
    def record(self, success):
        self.total += 1
        if success:
            self.success += 1
        else:
            self.failed += 1
    
    def print_stats(self):
        runtime = time.time() - self.start_time
        success_rate = (self.success / self.total * 100) if self.total > 0 else 0
        speed = self.total / runtime * 60 if runtime > 0 else 0
        
        print(f"\n统计信息:")
        print(f"运行时间: {runtime:.1f}秒")
        print(f"总请求: {self.total}")
        print(f"成功: {self.success} ({success_rate:.1f}%)")
        print(f"失败: {self.failed}")
        print(f"速度: {speed:.1f} 请求/分钟")

# Usage
stats = CrawlerStats()
for url in urls:
    try:
        response = requests.get(url)
        stats.record(response.status_code == 200)
    except requests.RequestException:
        stats.record(False)
    
    if stats.total % 10 == 0:  # print stats every 10 requests
        stats.print_stats()

Summary

These 5 approaches each have their place:

  • Proxy IPs: fastest to take effect, good for emergencies
  • Rate limiting: the most basic measure, always use it
  • UA rotation: lowest cost, decent results
  • Session reuse: better efficiency, fewer wasted resources
  • Distributed crawling: the ultimate option for large-scale scraping

In real projects, combine them: start with basic rate limiting + UA rotation + Session reuse, then add the proxy pool once your IP starts getting banned. A rough sketch of that combination follows.
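
A minimal sketch, assuming requests is imported and the ProxyPool, SmartDelay, UARotator and SmartSession classes defined above are in scope:

def crawl(urls, use_proxy=False):
    session = SmartSession()      # connection reuse + retries
    ua = UARotator()              # rotating request headers
    delay = SmartDelay()          # adaptive pacing
    pool = ProxyPool() if use_proxy else None   # only once IPs get banned
    
    try:
        for i, url in enumerate(urls):
            if i > 0:
                delay.wait()
            
            kwargs = {'headers': ua.get_headers()}
            if pool:
                proxy = pool.get_proxy()
                if proxy:
                    kwargs['proxies'] = proxy
            
            try:
                response = session.get(url, **kwargs)
                delay.record_result(response.status_code == 200)
            except requests.RequestException:
                delay.record_result(False)
    finally:
        session.close()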

One last reminder: these techniques are for solving problems, not for causing damage. Crawl in moderation and don't put heavy load on other people's servers. Everyone has to make a living, so be considerate.

Remember one principle: use a public API instead of scraping when you can, crawl slowly rather than aggressively, and crawl as little as you need.
