1、模块介绍
BeautifulSoup 模块用于解析 HTML 源代码并从中筛选出需要的内容,作用类似于用正则表达式提取文本,但它是按标签结构进行查找的。
2、代码
import os
from io import BytesIO
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from PIL import Image
# Download every <img> referenced on the Douban home page into a local
# directory, then check each saved file with Pillow and delete the ones that
# fail verification.

# requests expects headers as a mapping of header name -> value, so the
# browser identification string must live under the 'User-Agent' key.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
    )
}

# Send the headers with the request and bound the wait so an unresponsive
# server cannot hang the script forever.
res = requests.get('https://www.douban.com/', headers=headers, timeout=10)

output_dir = 'downloaded_images'
os.makedirs(output_dir, exist_ok=True)

soup = BeautifulSoup(res.text, 'html.parser')
img_tags = soup.find_all('img')

for idx, img in enumerate(img_tags):
    img_url = img.get('src')
    if not img_url:
        # <img> tags without a src attribute have nothing to download.
        continue

    # Resolve relative and protocol-relative ("//host/path") sources against
    # the URL of the page they were found on; absolute URLs pass through.
    img_url = urljoin(res.url, img_url)

    try:
        response = requests.get(img_url, headers=headers, timeout=10)
        response.raise_for_status()  # make sure the request succeeded

        img_name = f'image_{idx}.jpg'
        img_path = os.path.join(output_dir, img_name)
        with open(img_path, 'wb') as file:
            file.write(response.content)
        print(f"图片 {img_name} 已下载")

        try:
            # verify() raises if the image data is broken; the context
            # manager releases the image handle afterwards.
            with Image.open(BytesIO(response.content)) as image:
                image.verify()
            print(f"图片 {img_name} 通过自动检测")
        except Exception as e:
            print(f"图片 {img_name} 自动检测失败:{e}")
            # Do not keep files that Pillow could not verify.
            os.remove(img_path)
    except Exception as e:
        # Best-effort download: report the failure and move on to the next
        # image instead of aborting the whole run.
        print(f"下载图片失败:{e}")
具体就长这个样子(这里的 res.text 是把响应内容转换成文本字符串,在其他地方也可能直接把 res 的内容传进去;后面的 'html.parser' 是指定的解析器)