我们通过 Crawl4AI 和 Ollama 中的本地模型 deepseek-r1:8b 实现百度图片定向抓取,包含动态渲染、尺寸筛选和模型辅助验证。
例如:抓取包含"韩立"的图片,且只保留横版的(即宽大于高的图片)。
一、技术方案设计
二、完整实现代码
1. 环境配置
# 安装增强依赖
pip install crawl4ai[async] pillow httpx
2. 核心代码实现
import asyncio
import re
from io import BytesIO

import httpx
from crawl4ai import AsyncWebCrawler
from ollama import Client
from PIL import Image
class BaiduImageCrawler:
    """Crawl Baidu Image search results for landscape images of a target subject.

    Pipeline: dynamic-render the search page, extract thumbnail metadata,
    filter by aspect ratio (width > height), verify content with a local
    Ollama model, then resolve the HD image URL.
    """

    def __init__(self):
        # JS rendering is required: Baidu image results are injected by script.
        self.crawler = AsyncWebCrawler(
            max_concurrency=10,
            js_rendering=True,
            render_wait=2,
            headless_browser={
                "viewport": {"width": 1920, "height": 1080}
            }
        )
        # Local Ollama server on its default port.
        self.ollama = Client(host="http://localhost:11434")

    async def get_hanli_images(self, url):
        """Main crawl flow.

        Crawls `url`, extracts image metadata from each successful page and
        filters/verifies all images concurrently.

        Returns:
            list[dict]: accepted images as {'url': ..., 'size': (w, h)}.
        """
        results = []
        async for page in self.crawler.crawl(url):
            if not page.success:
                continue
            # Extract per-image metadata from the rendered HTML.
            images = self.extract_image_data(page.html)
            # Filter/verify every image of this page in parallel.
            tasks = [self.process_image(img) for img in images]
            results += await asyncio.gather(*tasks)
        # process_image returns None for rejected images; drop them.
        return [img for img in results if img]

    def extract_image_data(self, html):
        """Parse basic image info (url, title, thumbnail size) from page HTML.

        NOTE(review): the '.imgbox' selector and 'data-src'/'width'/'height'
        attributes are assumptions about Baidu's current markup — confirm
        against a live results page before relying on them.
        """
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(html, 'lxml')
        images = []
        for div in soup.select('.imgbox'):
            try:
                tag = div.find('img')
                images.append({
                    'url': tag['data-src'],
                    'title': tag['alt'],
                    'thumbnail_size': (
                        int(tag['width']),
                        int(tag['height'])
                    ),
                })
            except (TypeError, KeyError, ValueError):
                # Missing <img>, missing attribute, or non-numeric size:
                # skip this entry instead of aborting the whole page.
                continue
        return images

    async def process_image(self, img):
        """Filter one image; return its HD info dict, or None if rejected."""
        # Stage 1: cheap landscape check on the thumbnail dimensions.
        width, height = img['thumbnail_size']
        if width <= height:
            return None
        # Stage 2: download + model-based verification.
        if not await self.validate_image(img['url']):
            return None
        # Accepted: resolve the HD original.
        return await self.get_hd_image(img['url'])

    async def validate_image(self, url):
        """Download the image, re-check its aspect ratio, then query the model.

        NOTE(review): deepseek-r1:8b is a text-only model — it cannot fetch or
        see the image behind `url`, so this "content check" only reacts to the
        URL text itself. For a real content check, use a vision model (e.g.
        llava) and pass the image bytes.
        """
        try:
            # Fetch the actual image bytes (with a timeout so a stalled
            # download cannot hang the whole pipeline).
            async with httpx.AsyncClient() as client:
                resp = await client.get(url, timeout=10)
            img = Image.open(BytesIO(resp.content))
            width, height = img.size
            # Re-check with the real dimensions (thumbnail size may lie).
            if width <= height:
                return False
            # Model-based content verification.
            response = self.ollama.generate(
                model='deepseek-r1:8b',
                prompt=f"这张图片是否包含韩立?只回答是或否\n图片URL:{url}",
                options={"temperature": 0}
            )
            return "是" in response['response']
        except Exception:
            # Network error, undecodable image data, or Ollama unavailable:
            # treat as "not valid" rather than crashing the whole crawl.
            return False

    async def get_hd_image(self, thumb_url):
        """Resolve the HD variant of a thumbnail URL and return it with its size."""
        # Baidu-specific thumbnail -> HD URL rewrite.
        hd_url = thumb_url.replace('thumbnail', 'large')
        try:
            async with httpx.AsyncClient() as client:
                resp = await client.get(hd_url, timeout=10)
            return {
                'url': hd_url,
                'size': Image.open(BytesIO(resp.content)).size,
            }
        except Exception:
            # Best-effort: a missing/broken HD image simply yields no result.
            return None
# 运行示例
async def main():
    """Usage example: search Baidu Images for "韩立" and print accepted hits."""
    crawler = BaiduImageCrawler()
    search_url = (
        "https://image.baidu.com/search/index?tn=baiduimage&word=%E9%9F%A9%E7%AB%8B"
    )
    matches = await crawler.get_hanli_images(search_url)
    print(f"找到 {len(matches)} 张符合条件的图片:")
    for item in matches:
        print(f"URL: {item['url']} 尺寸: {item['size']}")


asyncio.run(main())
三、关键优化点
- 智能分阶段筛选
def size_filter(images):
    """Three-stage filtering strategy.

    1. Quick reject by thumbnail dimensions.
    2. Confirm using the image's actual downloaded dimensions.
    3. Content verification by the model.
    """
    # Stage 1: thumbnail-based landscape pre-filter.
    landscape = [item for item in images if item['width'] > item['height']]
    # Stage 2: verify against the real dimensions.
    confirmed = []
    for item in landscape:
        real_w, real_h = get_real_size(item['url'])
        if real_w > real_h:
            confirmed.append(item)
    # Stage 3: model-based content verification.
    return model_verify(confirmed)
- 百度图片URL转换逻辑
def convert_hd_url(thumb_url):
    """Rewrite a Baidu thumbnail URL into its HD counterpart.

    Applies every known thumbnail->HD rewrite rule in order; rules that do
    not match leave the URL unchanged, so the function is safe to call on
    arbitrary URLs.

    Args:
        thumb_url: thumbnail image URL as scraped from the results page.

    Returns:
        The rewritten (HD) URL, or the input unchanged if no rule matched.
    """
    patterns = [
        (r'_b\d+\.jpg', '_large.jpg'),   # e.g. pic_b200.jpg -> pic_large.jpg
        (r'/thumbnail/', '/large/'),     # path-style thumbnails
        (r'size=m\d+', 'size=hd1080'),   # query-parameter size hints
    ]
    for pattern, replacement in patterns:
        thumb_url = re.sub(pattern, replacement, thumb_url)
    return thumb_url
- 抗反爬策略
# Anti-scraping configuration (illustrative fragment: `self` refers to the
# BaiduImageCrawler instance).
self.crawler = AsyncWebCrawler(
    headers={
        # Impersonate a desktop Chrome browser and pass a Baidu referer;
        # default client headers tend to get blocked.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://image.baidu.com/'
    },
    proxy_rotation=True,  # rotate proxies between requests — TODO confirm option name in crawl4ai
    request_delay=2.5     # seconds between requests, to stay under rate limits
)
四、执行结果示例
找到 23 张符合条件的图片:
URL: https://example.com/large/hanli_1.jpg 尺寸: (1920, 1080)
URL: https://example.com/large/hanli_2.jpg 尺寸: (2560, 1440)
...
五、注意事项
- 动态加载处理
# Handle dynamically loaded results: add scroll-to-load to the crawler
# config so more images render before the HTML is captured.
self.crawler = AsyncWebCrawler(
    scroll_down=3,        # scroll 3 times to load more results
    scroll_interval=1.5   # seconds to wait between scrolls
)
- 尺寸获取容错
async def get_real_size(url):
    """Fetch an image and return its real (width, height).

    Deliberately best-effort: any failure (network error, timeout,
    undecodable image data) maps to the invalid size (0, 0) so callers
    can filter on the result instead of handling exceptions.
    """
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(url, timeout=10)
        return Image.open(BytesIO(resp.content)).size
    except Exception:
        return (0, 0)  # invalid size sentinel
- 模型Prompt优化
# Verification prompt for the local model. The JSON braces are doubled
# ({{ }}) so str.format() can substitute {width}/{height} without raising
# on the literal braces — with single braces, .format() would fail.
prompt_template = """
请分析图片是否符合以下要求:
1. 主角是修仙小说角色"韩立"
2. 图片为横版(宽度 > 高度)
3. 非二次元画风
图片信息:
- 来源:百度图片搜索
- 尺寸:{width}x{height}
- 文件名:(unknown)
请用JSON格式返回结果:
{{
    "is_hanli": bool,
    "is_landscape": bool,
    "confidence": 0-1
}}"""
本方案通过以下技术组合实现精准抓取:
- 动态渲染:处理百度图片的JavaScript加载
- 三级筛选:缩略图尺寸 → 实际尺寸 → 模型验证
- 反反爬机制:自动代理轮换+请求头伪装
- 内容验证:本地模型确保图片主题正确性
实际运行前请确保Ollama服务已正确加载deepseek-r1:8b模型。