【Python——爬取次元小镇(已完善)】

import os
import re
import time

import requests
from requests.exceptions import RequestException


def download_and_save(img_url, image_count, referer=None, timeout=30):
    """Download one image and store it as ``./pic/img<image_count>.jpg``.

    Args:
        img_url: URL of the image to fetch with the module-level ``session``.
        image_count: Number used to build the output file name.
        referer: Optional ``Referer`` header value sent with this request only.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Request failures are reported with ``print`` and not raised.
    """
    # Pass per-request headers instead of editing ``session.headers`` in
    # place: the session merges them with its own defaults, and a Referer set
    # for one image no longer leaks into every later request.
    headers = {'Referer': referer} if referer else None
    try:
        img_response = session.get(img_url, headers=headers, timeout=timeout)
        img_response.raise_for_status()
    except RequestException as e:
        print(f"下载图片失败: {e}")
        return

    # Create the output directory on first use; without it ``open`` raises
    # FileNotFoundError, which the handler above does not catch.
    os.makedirs('./pic', exist_ok=True)
    file_path = f'./pic/img{image_count}.jpg'
    with open(file_path, 'wb') as f:
        f.write(img_response.content)
    print(f"图片保存成功: {file_path}")


# Shared session so all requests reuse connections and the same default headers.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
})

n = 0  # number of image candidates examined across all pages
m = 0  # number of images handed to download_and_save (also the file index)

# Compile the patterns once instead of rebuilding them for every page/image.
image_pattern = re.compile(r'data-original="(.*?)"')
referer_pattern = re.compile(r'<div class="kzpost-data"><a href="(.*?)" target="_blank"')
thumb_pattern = re.compile(r'thumburl=https://baidu.com&url=(.*)')

# Walk the listing pages.
for i in range(1, 335):
    # Only pages after the first carry a page number; the original appended
    # the number unconditionally and requested ".../p/1" for the first page.
    # NOTE(review): assumes the first listing page is served at /p/ - confirm.
    if i > 1:
        url = f'https://dimtown.com/p/page/{i}'
    else:
        url = 'https://dimtown.com/p/'

    # Keep the try body limited to the request that can actually raise.
    try:
        response = session.get(url, timeout=30)
        response.raise_for_status()  # treat HTTP error statuses as failures
    except RequestException as e:
        print(f"请求错误: {e}")
        continue

    content = response.text
    datas = image_pattern.findall(content)
    referers = referer_pattern.findall(content)

    for data_index, data in enumerate(datas, start=1):
        n += 1

        referer = None
        if 'https://image.baidu.com' in data:
            thumb_results = thumb_pattern.findall(data)
            if not thumb_results:
                # Nothing to download: do not consume a file index.
                continue
            img_url = thumb_results[0]
        elif 'https://cci1.dimtown.com' in data:
            img_url = data
            # Assumes the k-th image belongs to the k-th post link on the
            # page (simplification carried over from the original) - verify.
            if data_index <= len(referers):
                referer = referers[data_index - 1]
        else:
            continue

        m += 1
        download_and_save(img_url, m, referer)

        # Pause only after an actual download attempt to stay polite to the
        # server; adjust to match the site's crawling policy.
        time.sleep(1)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值