python爬虫开发（代码+教程）

QQ_1309399183

已于 2023-12-22 15:42:34 修改

阅读量672

点赞数 2

文章标签： python 爬虫 爬虫GUI制作

于 2023-05-15 15:24:51 首次发布

本文链接：https://blog.csdn.net/ALiLiLiYa/article/details/130685129

版权

简介：

该项目旨在提供一个简单易用的图片下载工具，能够从主流搜索引擎中爬取用户输入的关键词对应的原图URL并下载。它采用Python语言进行开发，使用了Requests、Selenium等库来实现功能。
在这里插入图片描述

功能：

该工具支持用户从Google、必应和百度三个主流搜索引擎中进行图片搜索，并提供GUI和CMD版本两种使用方式。用户可以通过GUI界面或命令行输入关键词，也可以通过输入包含关键词列表的文本文件来批量下载图片。

此外，该工具还支持多线程下载，可配置线程数以提高下载速度。用户还可以使用搜索引擎的条件查询（如 site:）来精确搜索所需图片。为方便科学上网用户，该工具还支持socks5和http代理的配置。
在这里插入图片描述

主要代码

# 完整代码：qq1309399183
def google_gen_query_url(keywords, face_only=False, safe_mode=False, image_type=None, color=None):
    """Build a Google Images search URL for the given keywords.

    Args:
        keywords: Search phrase; it is percent-encoded before being embedded
            in the URL.
        face_only: When exactly ``True``, add the ``itp:face`` filter.
        safe_mode: When exactly ``True``, emit ``safe=on``; any other value
            emits ``safe=off``.
        image_type: Optional image-type filter (e.g. ``"clipart"``). It is
            lowercased, and ``"linedrawing"`` is emitted as ``"lineart"``.
        color: Optional colour filter. ``"bw"`` (any case) selects the
            ``ic:gray`` filter; any other value is emitted as a specific
            colour name, lowercased.

    Returns:
        The complete query URL as a string. The ``tbs`` parameter is always
        present and holds the active filters joined by ``%2C`` (an encoded
        comma); it is empty when no filter is requested.
    """
    query_url = "https://www.google.com/search?tbm=isch&hl=en" + "&q=" + quote(keywords)
    query_url += "&safe=on" if safe_mode is True else "&safe=off"

    # Collect filters in a list so that they are always separated by exactly
    # one encoded comma, with no missing or dangling separators.
    filters = []

    if color is not None:
        color = color.lower()
        if color == "bw":
            filters.append("ic:gray")
        else:
            filters.extend(("ic:specific", "isc:{}".format(color)))

    if image_type is not None:
        image_type = image_type.lower()
        if image_type == "linedrawing":
            image_type = "lineart"
        filters.append("itp:{}".format(image_type))

    if face_only is True:
        filters.append("itp:face")

    return query_url + "&tbs=" + "%2C".join(filters)

# 从网页中获取Google图片搜索结果的URL
# driver: webdriver对象
# max_number: 最大下载数量
# quiet: 是否开启静默模式
def google_image_url_from_webpage(driver, max_number, quiet=False):
    thumb_elements_old = []
    thumb_elements = []
    while True:
        try:
            # 查找缩略图元素
            thumb_elements = driver.find_elements(By.CLASS_NAME, "rg_i")
            my_print("Find {} images.".format(len(thumb_elements)), quiet)
            # 若数量达到最大值则跳出循环
            if len(thumb_elements) >= max_number:
                break
            # 若未发现新的缩略图，则跳出循环
            if len(thumb_elements) == len(thumb_elements_old):
                break
            thumb_elements_old = thumb_elements
            # 模拟滚动鼠标至页面底部，加载更多缩略图
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            # 查找"show more"按钮并点击，以加载更多缩略图
            show_more = driver.find_elements(By.CLASS_NAME, "mye4qd")
            if len(show_more) == 1 and show_more[0].is_displayed() and show_more[0].is_enabled():
                my_print("Click show_more button.", quiet)
                show_more[0].click()
            time.sleep(3)
        except Exception as e:
            print("Exception ", e)
            pass
    
    # 若未找到缩略图则返回空列表
    if len(thumb_elements) == 0:
        return []

    my_print("Click on each thumbnail image to get image url, may take a moment ...", quiet)

    retry_click = []
    for i, elem in enumerate(thumb_elements):
        try:
            # 点击每个缩略图，以获取其对应的原图URL
            if i != 0 and i % 50 == 0:
                my_print("{} thumbnail clicked.".format(i), quiet)
            if not elem.is_displayed() or not elem.is_enabled():
                retry_click.append(elem)
                continue
            elem.click()
        except Exception as e:
            print("Error while clicking in thumbnail:", e)
            retry_click.append(elem)

    # 对于未能成功点击的缩略图进行重新尝试
    if len(retry_click) > 0:    
        my_print("Retry some failed clicks ...", quiet)
        for elem in retry_click:
            try:
                if elem.is_displayed() and elem.is_enabled():
                    elem.click()
            except Exception as e:
                print("Error while retrying click:", e)
    
    # 获取所有原图URL并存储至列表中
    image_elements = driver.find_elements(By.CLASS_NAME, "islib")
    image_urls = list()
    url_pattern = r"imgurl=\S*&amp;imgrefurl"
    for image_element in image_elements

用法：

用户可以运行gui.py脚本启动GUI界面，
通过输入关键词或上传包含关键词列表的文本文件来进行图片搜索和下载。用户还可以通过选择搜索引擎、设置线程数和代理来进行个性化设置。

如果用户更喜欢使用命令行，可以直接运行命令行版本的脚本并按照提示操作即可完成相同的任务。无论是哪种方式，该工具都会自动爬取图片的原图URL并下载至本地。

简单的demo示例：

Python是一种流行的编程语言，它可以用来编写各种类型的爬虫程序，包括图片爬虫。在Python中，有许多第三方库可以帮助您编写图片爬虫，其中最常用的是BeautifulSoup和Requests。

以下是一个简单的Python图片爬虫的例子：

python

import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Page to scrape
url = "https://www.example.com"

# Fetch the page. The timeout keeps the script from hanging on an
# unresponsive server, and raise_for_status() stops on HTTP error pages
# instead of parsing them.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the page content with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find every <img> tag
img_tags = soup.find_all("img")

# Create the local directory used to store the images
os.makedirs("images", exist_ok=True)

# Walk through all image tags and download each image
for img in img_tags:
    src = img.get("src")
    if not src:
        # <img> without a usable src attribute: nothing to download
        continue

    # Resolve relative and protocol-relative links against the page URL
    img_url = urljoin(url, src)
    parts = urlsplit(img_url)
    if parts.scheme not in ("http", "https"):
        # e.g. inline "data:" URIs cannot be fetched over HTTP
        continue

    # Use only the path component so query strings do not end up in the file name
    img_name = os.path.basename(parts.path)
    if not img_name:
        continue

    img_path = os.path.join("images", img_name)
    if os.path.exists(img_path):
        print(f"{img_name} already exists")
        continue

    try:
        img_response = requests.get(img_url, timeout=10)
        img_response.raise_for_status()
        with open(img_path, "wb") as handler:
            handler.write(img_response.content)
        print(f"Downloaded {img_name}")
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {img_url}: {e}")