【Python — Scraping 次元小镇 (Dimtown) (finalized)】
First published 2024-09-14 10:12:43

The full script below walks the list pages of dimtown.com, extracts image URLs from each page with regular expressions, and saves every image it finds under ./pic/:

import os
import re
import time

import requests
from requests.exceptions import RequestException


def download_and_save(img_url, image_count, referer=None):
    """Download an image and save it to disk."""
    # Copy the session headers so adding a Referer does not
    # permanently mutate the shared session state.
    headers = dict(session.headers)
    if referer:
        headers['Referer'] = referer
    try:
        img_response = session.get(img_url, headers=headers, timeout=10)
        img_response.raise_for_status()
        file_path = f'./pic/img{image_count}.jpg'
        with open(file_path, 'wb') as f:
            f.write(img_response.content)
        print(f"Image saved: {file_path}")
    except RequestException as e:
        print(f"Failed to download image: {e}")


# Reuse one session (and its connection pool) for all requests
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
})

os.makedirs('./pic', exist_ok=True)  # make sure the output directory exists

n = 0  # total number of image URLs examined
m = 0  # number of images handed off for download

# Walk through the list pages
for i in range(1, 335):
    # The first list page has no /page/<n> suffix
    url = 'https://dimtown.com/p/' if i == 1 else f'https://dimtown.com/p/page/{i}'
    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()  # make sure the request succeeded
        content = response.text
        datas = re.findall('data-original="(.*?)"', content)
        referers = re.findall('<div class="kzpost-data"><a href="(.*?)" target="_blank"', content)
        for data_index, data in enumerate(datas, start=1):
            n += 1
            if 'https://image.baidu.com' in data:
                m += 1
                thumb_results = re.findall(r'thumburl=https://baidu.com&url=(.*)', data)
                if thumb_results:
                    img_url = thumb_results[0]
                    download_and_save(img_url, m)
            elif 'https://cci1.dimtown.com' in data:
                m += 1
                # Assume each data URL lines up with one referer; fall back to None
                download_and_save(data, m,
                                  referers[data_index - 1] if data_index <= len(referers) else None)
        # A simple delay between pages to avoid hammering the server
        time.sleep(1)  # tune this value to respect the site's crawling policy
    except RequestException as e:
        print(f"Request error: {e}")
        continue
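To see what the two regular expressions capture, here is a small self-contained sketch run against a hypothetical list-page fragment (the fragment is invented for illustration; the real dimtown.com markup may differ):

import re

# Invented HTML fragment standing in for one entry on a list page
sample = (
    '<div class="kzpost-data"><a href="https://dimtown.com/12345.html" target="_blank">'
    '<img data-original="https://cci1.dimtown.com/wp-content/uploads/cover.jpg">'
)

datas = re.findall('data-original="(.*?)"', sample)
referers = re.findall('<div class="kzpost-data"><a href="(.*?)" target="_blank"', sample)

print(datas)     # ['https://cci1.dimtown.com/wp-content/uploads/cover.jpg']
print(referers)  # ['https://dimtown.com/12345.html']

Each data-original value is an image URL; each kzpost-data link is the post page the image belongs to, which the script passes along as the Referer header.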
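As written, the script simply skips a page or image when a request fails. If failures turn out to be transient (timeouts, throttling), one possible refinement, not part of the original script, is a small retry helper with backoff; a minimal sketch, assuming the same requests session:

import time

import requests
from requests.exceptions import RequestException


def get_with_retry(session, url, retries=3, backoff=2.0, **kwargs):
    """Fetch a URL, retrying on transient request errors.

    Hypothetical helper, not part of the original script.
    """
    kwargs.setdefault('timeout', 10)
    for attempt in range(1, retries + 1):
        try:
            response = session.get(url, **kwargs)
            response.raise_for_status()
            return response
        except RequestException as e:
            if attempt == retries:
                raise  # give up after the last attempt
            print(f"Attempt {attempt} failed ({e}); retrying...")
            time.sleep(backoff * attempt)  # linear backoff between attempts

A call such as get_with_retry(session, img_url, headers=headers) could then stand in for the bare session.get calls in download_and_save and the page loop.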