图片爬虫小脚本

一个基于 PyQt5 的图片爬取工具(PhotoSpideMan),可以从百度图片搜索中批量下载图片,并支持暂停、继续和取消下载功能。该工具提供了用户友好的界面,可以设置图片类别、数量、格式、保存路径等参数,同时支持日志保存和加载、图片预览和删除功能。exe 审核通过后上传。

(图个方便,目前仅限百度,别乱爬奇奇怪怪的东西)

import os
import re
import sys

import requests
from PyQt5.QtCore import QThread, pyqtSignal, Qt
from PyQt5.QtGui import QIcon, QFont, QPixmap
from PyQt5.QtWidgets import (QApplication, QWidget, QVBoxLayout, QHBoxLayout, QLabel, QLineEdit, QPushButton,
                             QFileDialog, QTextEdit, QProgressBar, QMessageBox, QMenuBar, QAction, QComboBox, QDialog,
                             QFrame, QSplitter, QSizePolicy, QStatusBar, QInputDialog, QScrollArea)


class ScraperThread(QThread):
    """Worker thread that scrapes Baidu image search and saves the images to disk.

    Signals:
        update_log(str): one human-readable log line.
        update_progress(int): download progress as a percentage (0-100).
        update_counts(int, int): running (success_count, fail_count) totals.
        download_completed(): emitted when the run reaches its end without
            having been cancelled through ``stop()``.
    """

    update_log = pyqtSignal(str)
    update_progress = pyqtSignal(int)
    update_counts = pyqtSignal(int, int)
    download_completed = pyqtSignal()

    SEARCH_URL = 'https://image.baidu.com/search/flip'

    def __init__(self, category_name, total_batches, save_path, image_format, timeout, max_retries, user_agent,
                 max_concurrent, proxy=None):
        """Store the download settings.

        Args:
            category_name: search keyword; also used as the file name prefix.
            total_batches: number of result pages to request.
            save_path: directory the images are written to.
            image_format: file extension (without the dot) given to saved files.
            timeout: per-request timeout in seconds.
            max_retries: attempts made per image before it counts as failed.
            user_agent: value of the ``User-Agent`` request header.
            max_concurrent: stored for callers; ``run()`` downloads sequentially
                and does not read it.
            proxy: ``None``, a requests-style proxies mapping, or a single proxy
                URL string that is applied to both http and https.
        """
        super().__init__()
        self.category_name = category_name
        self.total_batches = total_batches
        self.save_path = save_path
        self.image_format = image_format
        self.timeout = timeout
        self.max_retries = max_retries
        self.user_agent = user_agent
        self.max_concurrent = max_concurrent
        # requests expects a scheme -> URL mapping; accept a bare URL string too.
        if isinstance(proxy, str):
            proxy = proxy.strip()
            proxy = {'http': proxy, 'https': proxy} if proxy else None
        self.proxy = proxy
        self.is_running = True
        self.is_paused = False
        self.success_count = 0
        self.fail_count = 0
        self.total_size = 0  # total number of bytes downloaded so far
        self.start_time = None  # reserved for the download start time; not set by run()

    def run(self):
        """Fetch each result page, then download every new image link found on it."""
        image_count = 0
        total_images = self.total_batches * 60  # progress assumes 60 images per batch
        image_urls = set()
        headers = {'User-Agent': self.user_agent}
        # Characters that are illegal in file names would make every write fail.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', self.category_name)

        for batch in range(self.total_batches):
            if not self._wait_while_paused():
                break
            # Passing the keyword through ``params`` lets requests escape it, so
            # characters such as '&' or '#' cannot corrupt the query string.
            # NOTE(review): pages advance by 30 while progress assumes 60 per
            # batch - confirm the page size against the live site.
            params = {'tn': 'baiduimage', 'ie': 'utf-8', 'word': self.category_name, 'pn': batch * 30}
            try:
                response = requests.get(self.SEARCH_URL, params=params, headers=headers, timeout=self.timeout,
                                        proxies=self.proxy)
            except requests.RequestException as e:
                # A network error on one page must not kill the whole thread.
                self.update_log.emit(f'无法访问URL: {self.SEARCH_URL} 错误: {str(e)}')
                continue
            if response.status_code != 200:
                self.update_log.emit(f'无法访问URL: {response.url}')
                continue
            # Tolerate stray bytes instead of raising UnicodeDecodeError.
            html_content = response.content.decode('utf-8', errors='ignore')
            image_links = re.findall(r'"objURL":"(.*?)",', html_content)

            for image_link in image_links:
                if not self._wait_while_paused():
                    break
                if image_link in image_urls:
                    self.update_log.emit(f'跳过重复图片: {image_link}')
                    continue
                image_path = os.path.join(self.save_path, f'{safe_name}{image_count + 1}.{self.image_format}')
                size = self._download_image(image_link, headers, image_path)
                if size is None:
                    self.fail_count += 1
                    self.update_counts.emit(self.success_count, self.fail_count)
                    continue
                image_count += 1
                self.success_count += 1
                self.total_size += size
                self.update_log.emit(f'正在下载第{image_count}张图片: {image_path}')
                # Clamp: a page may yield more links than the assumed batch size.
                self.update_progress.emit(min(100, int((image_count / total_images) * 100)))
                self.update_counts.emit(self.success_count, self.fail_count)
                image_urls.add(image_link)

        # A cancelled run is not a completed one; only report real completion.
        if self.is_running:
            self.download_completed.emit()

    def _wait_while_paused(self):
        """Block while paused; return False as soon as the run has been cancelled."""
        # Checking is_running here lets a cancel take effect during a pause.
        while self.is_paused and self.is_running:
            self.sleep(1)
        return self.is_running

    def _download_image(self, image_link, headers, image_path):
        """Download one image with retries; return its size in bytes or None on failure."""
        for _ in range(self.max_retries):
            try:
                img_response = requests.get(image_link, headers=headers, timeout=self.timeout,
                                            proxies=self.proxy)
                if img_response.status_code == 200:
                    with open(image_path, 'wb') as file:
                        file.write(img_response.content)
                    return len(img_response.content)
                self.update_log.emit(f'下载失败: {image_link} 状态码: {img_response.status_code}')
            except Exception as e:
                # Deliberately broad: scraped links can be malformed in many ways
                # and one bad image must never abort the remaining downloads.
                self.update_log.emit(f'下载失败: {image_link} 错误: {str(e)}')
        return None

    def stop(self):
        """Request cancellation; the run loop exits at its next check."""
        self.is_running = False

    def pause(self):
        """Suspend downloading before the next page or image."""
        self.is_paused = True

    def resume(self):
        """Continue a paused download."""
        self.is_paused = False


class ImageScraper(QWidget):
    """Main window: collects the download settings and drives a ScraperThread."""

    # File extensions treated as downloaded images by preview and delete.
    IMAGE_EXTENSIONS = ('.jpg', '.png', '.gif')

    def __init__(self):
        super().__init__()
        self.initUI()

    def initUI(self):
        """Build the menu bar, settings form, control buttons, progress and log widgets."""
        self.setWindowTitle('PhotoSpideMan')
        self.setGeometry(300, 300, 800, 600)
        self.setWindowIcon(QIcon('app_icon.png'))

        # Menu bar
        self.menu_bar = QMenuBar(self)
        self.save_log_action = QAction('保存日志', self)
        self.save_log_action.triggered.connect(self.save_log)
        self.load_log_action = QAction('加载日志', self)
        self.load_log_action.triggered.connect(self.load_log)
        self.preview_action = QAction('图片预览', self)
        self.preview_action.triggered.connect(self.preview_images)
        self.about_action = QAction('关于作者', self)
        self.about_action.triggered.connect(self.show_about)
        file_menu = self.menu_bar.addMenu('文件')
        file_menu.addAction(self.save_log_action)
        file_menu.addAction(self.load_log_action)
        file_menu.addAction(self.preview_action)
        help_menu = self.menu_bar.addMenu('帮助')
        help_menu.addAction(self.about_action)

        main_layout = QVBoxLayout()
        main_layout.setMenuBar(self.menu_bar)

        form_layout = QVBoxLayout()
        form_layout.setSpacing(10)
        form_layout.setContentsMargins(10, 10, 10, 10)

        # Category (search keyword) input
        self.category_label = QLabel('图片类别:')
        self.category_input = QLineEdit(self)
        self.category_input.setFixedHeight(30)
        form_layout.addWidget(self.category_label)
        form_layout.addWidget(self.category_input)

        # Batch count input
        self.number_label = QLabel('图片数量(1=60张, 2=120张):')
        self.number_input = QLineEdit(self)
        self.number_input.setFixedHeight(30)
        form_layout.addWidget(self.number_label)
        form_layout.addWidget(self.number_input)

        # Image format selection
        self.format_label = QLabel('图片格式:')
        self.format_combobox = QComboBox(self)
        self.format_combobox.setFixedHeight(30)
        self.format_combobox.addItems(['jpg', 'png', 'gif'])
        form_layout.addWidget(self.format_label)
        form_layout.addWidget(self.format_combobox)

        # Request timeout
        self.timeout_label = QLabel('超时设置(秒):')
        self.timeout_input = QLineEdit(self)
        self.timeout_input.setFixedHeight(30)
        self.timeout_input.setText('10')
        form_layout.addWidget(self.timeout_label)
        form_layout.addWidget(self.timeout_input)

        # Maximum retries per image
        self.retries_label = QLabel('最大重试次数:')
        self.retries_input = QLineEdit(self)
        self.retries_input.setFixedHeight(30)
        self.retries_input.setText('3')
        form_layout.addWidget(self.retries_label)
        form_layout.addWidget(self.retries_input)

        # User-Agent selection
        self.user_agent_label = QLabel('User-Agent:')
        self.user_agent_combobox = QComboBox(self)
        self.user_agent_combobox.setFixedHeight(30)
        self.user_agent_combobox.addItems([
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
        ])
        form_layout.addWidget(self.user_agent_label)
        form_layout.addWidget(self.user_agent_combobox)

        # Maximum concurrent downloads
        self.concurrent_label = QLabel('最大并发下载数:')
        self.concurrent_input = QLineEdit(self)
        self.concurrent_input.setFixedHeight(30)
        self.concurrent_input.setText('5')
        form_layout.addWidget(self.concurrent_label)
        form_layout.addWidget(self.concurrent_input)

        # Optional proxy
        self.proxy_label = QLabel('代理(可选):')
        self.proxy_input = QLineEdit(self)
        self.proxy_input.setFixedHeight(30)
        form_layout.addWidget(self.proxy_label)
        form_layout.addWidget(self.proxy_input)

        # Separator line
        separator = QFrame()
        separator.setFrameShape(QFrame.HLine)
        separator.setFrameShadow(QFrame.Sunken)
        form_layout.addWidget(separator)

        # Save path row
        save_path_layout = QHBoxLayout()
        self.save_path_label = QLabel('保存路径:')
        self.save_path_label.setFont(QFont('Arial', 10))
        self.save_path_input = QLineEdit(self)
        self.save_path_input.setFixedHeight(30)
        self.save_path_input.setText(os.getcwd())
        self.browse_button = QPushButton('浏览', self)
        self.browse_button.setFixedHeight(30)
        self.browse_button.clicked.connect(self.browse_folder)
        save_path_layout.addWidget(self.save_path_label)
        save_path_layout.addWidget(self.save_path_input)
        save_path_layout.addWidget(self.browse_button)
        form_layout.addLayout(save_path_layout)

        main_layout.addLayout(form_layout)

        # Control buttons
        button_layout = QHBoxLayout()
        button_layout.setSpacing(10)

        self.start_button = QPushButton('开始下载', self)
        self.start_button.setFixedHeight(40)
        self.start_button.clicked.connect(self.start_download)
        button_layout.addWidget(self.start_button)

        self.pause_button = QPushButton('暂停下载', self)
        self.pause_button.setFixedHeight(40)
        self.pause_button.clicked.connect(self.pause_download)
        button_layout.addWidget(self.pause_button)

        self.resume_button = QPushButton('继续下载', self)
        self.resume_button.setFixedHeight(40)
        self.resume_button.clicked.connect(self.resume_download)
        button_layout.addWidget(self.resume_button)

        self.cancel_button = QPushButton('取消下载', self)
        self.cancel_button.setFixedHeight(40)
        self.cancel_button.clicked.connect(self.stop_download)
        button_layout.addWidget(self.cancel_button)

        self.clear_button = QPushButton('清空日志', self)
        self.clear_button.setFixedHeight(40)
        self.clear_button.clicked.connect(self.clear_log)
        button_layout.addWidget(self.clear_button)

        self.restart_button = QPushButton('再次爬取', self)
        self.restart_button.setFixedHeight(40)
        self.restart_button.clicked.connect(self.restart_download)
        self.restart_button.setEnabled(False)
        button_layout.addWidget(self.restart_button)

        self.delete_button = QPushButton('删除所有图片', self)
        self.delete_button.setFixedHeight(40)
        self.delete_button.clicked.connect(self.delete_all_images)
        button_layout.addWidget(self.delete_button)

        main_layout.addLayout(button_layout)

        # Progress bar and log area
        self.progress_bar = QProgressBar(self)
        main_layout.addWidget(self.progress_bar)

        self.log_area = QTextEdit(self)
        self.log_area.setReadOnly(True)
        self.log_area.setFont(QFont('Courier', 10))
        main_layout.addWidget(self.log_area)

        self.counts_label = QLabel('成功: 0  失败: 0')
        main_layout.addWidget(self.counts_label)

        # Status bar
        self.status_bar = QStatusBar(self)
        self.status_label = QLabel('状态: 准备就绪')
        self.status_bar.addWidget(self.status_label)
        main_layout.addWidget(self.status_bar)

        self.setLayout(main_layout)

    def _is_downloading(self):
        """Return True while a scraper thread exists and is still running."""
        thread = getattr(self, 'scraper_thread', None)
        return thread is not None and thread.isRunning()

    def _list_images(self, folder):
        """Return the sorted image file names in ``folder`` (extension match ignores case)."""
        return sorted(name for name in os.listdir(folder) if name.lower().endswith(self.IMAGE_EXTENSIONS))

    def browse_folder(self):
        """Let the user pick the save directory."""
        folder = QFileDialog.getExistingDirectory(self, "选择文件夹")
        if folder:
            self.save_path_input.setText(folder)

    def start_download(self):
        """Validate the form and start a new ScraperThread.

        Invalid input is reported in the log area instead of raising, and a new
        download is refused while a previous one is still running.
        """
        # Replacing a running QThread object would destroy it mid-run.
        if self._is_downloading():
            self.log_message('已有下载任务正在进行,请先取消或等待完成')
            return

        self.clear_log()
        category_name = self.category_input.text().strip()
        number_input = self.number_input.text().strip()
        image_format = self.format_combobox.currentText()
        user_agent = self.user_agent_combobox.currentText()
        # An empty path field falls back to the current working directory.
        save_path = self.save_path_input.text().strip() or os.getcwd()

        if not category_name:
            self.log_message('请输入图片类别')
            return
        if not (number_input.isdigit() and int(number_input) in [1, 2]):
            self.log_message('请输入有效的图片数量(1 或 2)')
            return
        try:
            timeout = int(self.timeout_input.text())
            max_retries = int(self.retries_input.text())
            max_concurrent = int(self.concurrent_input.text())
        except ValueError:
            self.log_message('超时、最大重试次数和最大并发下载数必须为整数')
            return
        if timeout <= 0 or max_retries <= 0 or max_concurrent <= 0:
            self.log_message('超时、最大重试次数和最大并发下载数必须大于 0')
            return
        try:
            # Without the directory every image write would fail.
            os.makedirs(save_path, exist_ok=True)
        except OSError as e:
            self.log_message(f'无法创建保存路径: {save_path} 错误: {e}')
            return

        # requests expects a scheme -> URL mapping, not a bare string.
        proxy_text = self.proxy_input.text().strip()
        proxy = {'http': proxy_text, 'https': proxy_text} if proxy_text else None

        self.restart_button.setEnabled(False)
        self.progress_bar.setValue(0)
        self.counts_label.setText('成功: 0  失败: 0')
        self.scraper_thread = ScraperThread(category_name, int(number_input), save_path, image_format, timeout,
                                            max_retries, user_agent, max_concurrent, proxy)
        self.scraper_thread.update_log.connect(self.log_message)
        self.scraper_thread.update_progress.connect(self.progress_bar.setValue)
        self.scraper_thread.update_counts.connect(self.update_counts)
        self.scraper_thread.download_completed.connect(self.download_completed)
        self.status_label.setText('状态: 正在下载')
        self.scraper_thread.start()

    def pause_download(self):
        """Pause the running download; does nothing when no download is active."""
        if self._is_downloading():
            self.scraper_thread.pause()
            self.status_label.setText('状态: 已暂停')

    def resume_download(self):
        """Resume the running download; does nothing when no download is active."""
        if self._is_downloading():
            self.scraper_thread.resume()
            self.status_label.setText('状态: 正在下载')

    def stop_download(self):
        """Cancel the download and reset the progress display."""
        if hasattr(self, 'scraper_thread'):
            self.scraper_thread.stop()
            # Clear the pause flag so a paused worker wakes up and sees the stop.
            self.scraper_thread.resume()
            self.progress_bar.setValue(0)
            self.counts_label.setText('成功: 0  失败: 0')
            self.status_label.setText('状态: 已取消')
            self.restart_button.setEnabled(True)

    def clear_log(self):
        """Empty the log area."""
        self.log_area.clear()

    def save_log(self):
        """Ask for a file name and write the log text to it."""
        log_text = self.log_area.toPlainText()
        save_path, _ = QFileDialog.getSaveFileName(self, "保存日志", "", "文本文件 (*.txt);;所有文件 (*)")
        if not save_path:
            return
        try:
            with open(save_path, 'w', encoding='utf-8') as file:
                file.write(log_text)
        except OSError as e:
            self.log_message(f'保存日志失败: {e}')

    def load_log(self):
        """Ask for a log file and show its contents in the log area."""
        load_path, _ = QFileDialog.getOpenFileName(self, "加载日志", "", "文本文件 (*.txt);;所有文件 (*)")
        if not load_path:
            return
        try:
            with open(load_path, 'r', encoding='utf-8') as file:
                log_text = file.read()
        except (OSError, UnicodeDecodeError) as e:
            self.log_message(f'加载日志失败: {e}')
            return
        self.log_area.setText(log_text)

    def preview_images(self):
        """Open a PreviewDialog for the images in the save directory."""
        save_path = self.save_path_input.text()
        # isdir rather than exists: listing a regular file would raise.
        if not os.path.isdir(save_path):
            self.log_message('保存路径不存在')
            return
        images = self._list_images(save_path)
        if not images:
            self.log_message('没有找到图片进行预览')
            return
        preview_dialog = PreviewDialog(images, save_path)
        preview_dialog.exec_()

    def update_counts(self, success, fail):
        """Show the success / failure totals reported by the worker."""
        self.counts_label.setText(f'成功: {success}  失败: {fail}')

    def download_completed(self):
        """Notify the user, re-enable restart and auto-save the log."""
        QMessageBox.information(self, '完成', '所有图片下载完成!')
        self.restart_button.setEnabled(True)
        self.status_label.setText('状态: 下载完成')
        self.auto_save_log()

    def restart_download(self):
        """Offer to save the current log, then start a new download."""
        self.save_log()
        self.start_download()

    def log_message(self, message):
        """Append one line to the log area."""
        self.log_area.append(message)

    def show_about(self):
        """Show the author information box."""
        QMessageBox.information(self, '关于作者', '哈哈哈\nEmail: 3217860797@qq.com')

    def delete_all_images(self):
        """After confirmation, delete every jpg/png/gif file in the save directory.

        This removes all matching files in the folder, not only the ones this
        tool downloaded.
        """
        save_path = self.save_path_input.text()
        if not os.path.isdir(save_path):
            self.log_message('保存路径不存在')
            return
        images = self._list_images(save_path)
        if not images:
            self.log_message('没有找到图片进行删除')
            return
        reply = QMessageBox.question(self, '删除所有图片', '确定要删除所有下载的图片吗?',
                                     QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
        if reply != QMessageBox.Yes:
            return
        failed = 0
        for image in images:
            try:
                os.remove(os.path.join(save_path, image))
            except OSError as e:
                # Keep going: one locked file should not stop the clean-up.
                failed += 1
                self.log_message(f'删除失败: {image} 错误: {e}')
        if failed == 0:
            self.log_message('所有图片已删除')

    def auto_save_log(self):
        """Write the log to ``download_log.txt`` inside the save directory."""
        log_text = self.log_area.toPlainText()
        save_path = os.path.join(self.save_path_input.text(), 'download_log.txt')
        try:
            with open(save_path, 'w', encoding='utf-8') as file:
                file.write(log_text)
        except OSError as e:
            self.log_message(f'日志自动保存失败: {e}')
            return
        self.log_message(f'日志已自动保存到 {save_path}')


class PreviewDialog(QDialog):
    """Dialog showing a scrollable column of thumbnails for the given image files."""

    def __init__(self, images, save_path):
        """Build the thumbnail list.

        Args:
            images: file names located inside ``save_path``.
            save_path: directory that contains the image files.
        """
        super().__init__()
        self.setWindowTitle('图片预览')
        self.setGeometry(300, 300, 600, 400)

        container = QWidget()
        thumbnails_layout = QVBoxLayout(container)
        for image in images:
            pixmap = QPixmap(os.path.join(save_path, image))
            # Files that cannot be decoded as images load as a null pixmap; skip them.
            if pixmap.isNull():
                continue
            label = QLabel(container)
            label.setPixmap(pixmap.scaled(200, 200, Qt.KeepAspectRatio))
            thumbnails_layout.addWidget(label)

        # The thumbnails live in a scroll area so that dozens of images do not
        # stretch the dialog beyond the screen.
        scroll_area = QScrollArea(self)
        scroll_area.setWidgetResizable(True)
        scroll_area.setWidget(container)

        layout = QVBoxLayout()
        layout.addWidget(scroll_area)
        self.setLayout(layout)


if __name__ == '__main__':
    # Create the Qt application, show the main window and hand the event
    # loop's return value back to the shell as the process exit code.
    application = QApplication(sys.argv)
    main_window = ImageScraper()
    main_window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)

爬虫代码借鉴:如何快速制作一个垃圾分类图像识别器(卷积神经网络)_tf-estimator-nightly==2.8.0.dev2021122109-CSDN博客

大水文解释:图片爬虫小脚本(他说写了送1500推广)-CSDN博客

exe下载:

链接:https://pan.baidu.com/s/1_B8F4LdvKYHZMNkrTPJJPA 
提取码:6666

  • 9
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

+1MB

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值