10. Asynchronous I/O and High-Performance Crawler Architecture
10.1 An Asynchronous Crawler Based on aiohttp
import aiohttp
import asyncio
from bs4 import BeautifulSoup


async def fetch(session, url):
    """Fetch a page and return its HTML text, or None on failure."""
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=10)) as response:
            if response.status == 200:
                return await response.text()
            return None
    except Exception as e:
        print(f"Request failed: {e}")
        return None


async def parse_product(url):
    """Download a product page and extract basic fields from it."""
    async with aiohttp.ClientSession(
        headers={'User-Agent': 'Mozilla/5.0'}
    ) as session:
        html = await fetch(session, url)
        if html:
            soup = BeautifulSoup(html, 'lxml')
            # Placeholder selector; adapt it to the target site's actual markup.
            title = soup.select_one('h1')
            return {'url': url, 'title': title.get_text(strip=True) if title else None}
        return None
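The coroutine above handles a single URL. The sketch below shows one way to drive it for many pages at once, which is where the asynchronous architecture pays off: all requests are scheduled concurrently with asyncio.gather, and an asyncio.Semaphore caps how many run at the same time. The URL list and the concurrency limit are illustrative assumptions, and the sketch simply reuses parse_product as defined above.

import asyncio

# Hypothetical URL list for illustration; replace with real product pages.
PRODUCT_URLS = [f'https://example.com/product/{i}' for i in range(1, 51)]


async def crawl_all(urls, max_concurrency=10):
    """Run parse_product for every URL, at most max_concurrency at a time."""
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded(url):
        async with semaphore:
            return await parse_product(url)

    # gather() schedules all coroutines concurrently and preserves input order.
    results = await asyncio.gather(*(bounded(u) for u in urls))
    return [r for r in results if r is not None]


if __name__ == '__main__':
    products = asyncio.run(crawl_all(PRODUCT_URLS))
    print(f'Fetched {len(products)} products')

Note that because parse_product opens its own ClientSession, every page pays the cost of a new connection pool; in a production crawler you would normally create one shared session and pass it down to the worker coroutines instead.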