Sogou Image Crawler

Scrape whatever Sogou images you want, hehe.
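The core trick: Sogou's `/d` result pages embed each full-size image URL in a `drag-img` attribute on their `<img>` tags, and the `did` parameter pages through results. Here is a minimal sketch of that single step, using a trimmed-down version of the parameters from the full script below (whether the endpoint needs the remaining empty parameters, or a browser User-Agent header, is untested):

```python
import requests
from bs4 import BeautifulSoup

# Fetch one result page and list the image URLs it embeds.
# `did` selects the page; the query matches the full script below.
resp = requests.get(
    "https://pic.sogou.com/d",
    params={"query": "壁纸", "mode": "13", "dm": "4", "did": "1"},
    timeout=10,
)
soup = BeautifulSoup(resp.text, "html.parser")
urls = [img["drag-img"] for img in soup.find_all("img", {"drag-img": True})]
print(urls[:5])
```

The full script: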

import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urlparse


def download_images(url, params, max_images):
    # Create the folder that will hold the downloaded images
    if not os.path.exists("images"):
        os.makedirs("images")

    # Initialize the page variable `did` and the download counter
    did = 1
    downloaded_images = 0

    # File that records which images have already been downloaded
    record_file = "downloaded_images.txt"

    # Create the record file with a header line if it does not exist yet
    if not os.path.exists(record_file):
        with open(record_file, "w") as f:
            f.write("Downloaded Images\n")

    # Loop until the target count is reached or no more images are returned
    while downloaded_images < max_images:
        try:
            # Update the `did` request parameter for the next page
            params['did'] = str(did)

            # Fetch the result page
            response = requests.get(url, params=params, timeout=10)
            html_content = response.text

            # Parse the HTML with BeautifulSoup
            soup = BeautifulSoup(html_content, "html.parser")

            # Find all tags that carry an image link
            img_tags = soup.find_all("img", {"drag-img": True})
            # Also try a CSS selector for the preview area
            img_tags.extend(soup.select("#imgArea > div.img-preview-wrap > div > div > img"))

            # Extract the image URLs, skipping tags without a drag-img attribute
            image_urls = [img["drag-img"] for img in img_tags if img.get("drag-img")]

            # Stop when a page yields no images; otherwise `did` grows forever
            if not image_urls:
                print("No more images found, stopping.")
                break

            # Download and save each image
            for image_url in image_urls:
                if downloaded_images >= max_images:
                    break
                try:
                    # Prepend the protocol if the URL is protocol-relative
                    parsed_url = urlparse(image_url)
                    if not parsed_url.scheme:
                        image_url = "https:" + image_url

                    # Build the local file name and path
                    image_name = f"image_{downloaded_images + 1}.jpg"
                    image_path = os.path.join("images", image_name)

                    # Skip images that already exist on disk
                    if os.path.exists(image_path):
                        print(f"Image {image_name} already exists, skipping.")
                        downloaded_images += 1
                        continue

                    # verify=False tolerates image hosts with broken certificates
                    response = requests.get(image_url, verify=False, timeout=10)
                    if response.status_code == 200:
                        with open(image_path, "wb") as f:
                            f.write(response.content)
                        print(f"Image {image_name} saved.")
                        downloaded_images += 1

                        # Append the image's name and URL to the record file
                        with open(record_file, "a") as f:
                            f.write(f"{image_name}: {image_url}\n")

                    else:
                        print(f"Failed to download image {image_url}!")
                except Exception as e:
                    print(f"Error while downloading image {image_url}: {e}")
                    continue

            did += 1  # Advance to the next page

        except Exception as e:
            print(f"Error while requesting page {url}: {e}")
            break


if __name__ == "__main__":
    query = "壁纸"  # Search keyword; here "壁纸" (wallpaper)
    base_url = "https://pic.sogou.com/d"
    params = {
        "query": query,
        "forbidqc": "",
        "entityid": "",
        "preQuery": "",
        "rawQuery": "",
        "queryList": "",
        "st": "",
        "mode": "13",
        "cwidth": "1920",
        "cheight": "1080",
        "dm": "4",
        "did": "1"  # Initially set to 1
    }
    max_images = 1000  # Maximum number of images to download
    download_images(base_url, params, max_images)
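One caveat: file names come from a counter, so a fresh run starts over at image_1.jpg and the on-disk existence check matches by position rather than by URL. A small sketch of a resume helper (the name `load_downloaded_urls` is mine; it relies on the `name: url` record format written above and the script's existing `import os`):

```python
import os

def load_downloaded_urls(record_file="downloaded_images.txt"):
    # Read the record file written by download_images and return the set
    # of image URLs already fetched. Data lines look like "name: url".
    urls = set()
    if not os.path.exists(record_file):
        return urls
    with open(record_file) as f:
        for line in f:
            parts = line.strip().split(": ", 1)
            if len(parts) == 2:  # skips the "Downloaded Images" header line
                urls.add(parts[1])
    return urls
```

Checking `image_url in load_downloaded_urls()` before each download would make the skip logic follow URLs instead of counter-based file names.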

An alternative approach queries Sogou Image's JSON interface directly instead of parsing HTML:

```python
import requests
import os


def download_images(keyword, num_images):
    url = 'https://pic.sogou.com/pics'
    params = {
        'query': keyword,
        'mode': '1',
        'start': '0',
        'reqType': 'ajax',
        'tn': '0',
        'reqFrom': 'result',
        'interV': '',
        'category': '',
        '_asf': 'pic.sogou.com',
        '_ast': '1629966077',
        'w': '01019900',
        'p': '40030500',
        'dp': '1',
        'cid': '',
        's': '',
        'sut': '0',
        'sst0': '1629966077042',
        'lkt': '0,0,0',
        'pfp': '',
        'dpf': '',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    }

    os.makedirs(keyword, exist_ok=True)

    for i in range(num_images):
        params['start'] = str(i * 48)
        response = requests.get(url, params=params, headers=headers)
        response_json = response.json()

        for item in response_json['items']:
            image_url = item['picUrl']
            image_name = image_url.split('/')[-1]
            image_path = os.path.join(keyword, image_name)

            try:
                image_data = requests.get(image_url, headers=headers).content
                with open(image_path, 'wb') as f:
                    f.write(image_data)
                print(f"Downloaded image: {image_name}")
            except Exception as e:
                print(f"Failed to download image {image_name}, error: {str(e)}")


# Call the function to download images
download_images('猫', 10)
```

This version sends HTTP requests with the requests library, fetches image links through Sogou Image's API endpoint, and saves the images locally. Change the keyword and download count as needed.
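The field names 'items' and 'picUrl' above are whatever the interface returned when this snippet was written; if Sogou changes the response shape, the bare indexing raises KeyError, and `.json()` raises on a non-JSON error page. A hedged defensive variant of the parsing step (the helper name `extract_image_urls` is mine, assuming the same response shape):

```python
def extract_image_urls(response):
    # Tolerant version of the parsing step above: returns [] instead of
    # raising when the response is not JSON or the expected keys are absent.
    # 'items' and 'picUrl' are assumptions carried over from the snippet.
    try:
        data = response.json()
    except ValueError:
        print("Response was not JSON; the interface may have changed.")
        return []
    return [item["picUrl"] for item in data.get("items", []) if item.get("picUrl")]
```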
