一个基于 PyQt5 的图片爬取工具(PhotoSpideMan),可以从百度图片搜索中批量下载图片,并支持暂停、继续和取消下载功能。
该工具提供了用户友好的界面,可以设置图片类别、数量、格式、保存路径等参数,并支持日志保存和加载、图片预览和删除功能。exe 审核通过后上传。
(图个方便,目前仅限百度,别乱扒奇奇怪怪的内容)
import os
import re
import sys

import requests
from PyQt5.QtCore import QThread, pyqtSignal, Qt
from PyQt5.QtGui import QIcon, QFont, QPixmap
from PyQt5.QtWidgets import (QApplication, QWidget, QVBoxLayout, QHBoxLayout, QLabel, QLineEdit, QPushButton,
                             QFileDialog, QTextEdit, QProgressBar, QMessageBox, QMenuBar, QAction, QComboBox, QDialog,
                             QFrame, QSplitter, QSizePolicy, QStatusBar, QInputDialog, QScrollArea)
class ScraperThread(QThread):
    """Worker thread that downloads Baidu image-search results to disk.

    Signals:
        update_log(str): one human-readable log line.
        update_progress(int): download progress as a percentage (0-100).
        update_counts(int, int): running (success, failure) totals.
        download_completed(): emitted once when ``run`` finishes, whether the
            loop ran to the end or was stopped early via ``stop``.
    """

    update_log = pyqtSignal(str)
    update_progress = pyqtSignal(int)
    update_counts = pyqtSignal(int, int)
    download_completed = pyqtSignal()

    # Number of images one batch is assumed to yield; only used to scale the
    # progress percentage.
    IMAGES_PER_BATCH = 60
    # Increment of the ``pn`` query parameter between consecutive batches.
    # NOTE(review): this step (30) is smaller than IMAGES_PER_BATCH (60), so
    # consecutive pages may overlap; duplicates are skipped via the seen-URL
    # set. Confirm against the actual page size of the search endpoint.
    PAGE_STEP = 30
    SEARCH_URL = 'https://image.baidu.com/search/flip'
    OBJ_URL_PATTERN = re.compile(r'"objURL":"(.*?)",')

    def __init__(self, category_name, total_batches, save_path, image_format, timeout, max_retries, user_agent,
                 max_concurrent, proxy=None):
        """Store the download settings.

        Args:
            category_name: search keyword; also used as the file-name prefix.
            total_batches: number of result pages to request.
            save_path: directory the images are written into.
            image_format: file extension (without dot) given to saved files.
            timeout: per-request timeout in seconds.
            max_retries: download attempts per image before it counts as failed.
            user_agent: value of the ``User-Agent`` request header.
            max_concurrent: stored but not used by ``run``; downloads are
                sequential.
            proxy: ``None``, a proxy URL string, or a ``proxies`` mapping as
                accepted by ``requests``.
        """
        super().__init__()
        self.category_name = category_name
        self.total_batches = total_batches
        self.save_path = save_path
        self.image_format = image_format
        self.timeout = timeout
        self.max_retries = max_retries
        self.user_agent = user_agent
        self.max_concurrent = max_concurrent
        # ``requests`` expects a scheme -> URL mapping; accept a bare URL
        # string from callers and expand it here.
        if isinstance(proxy, str):
            proxy = proxy.strip()
            proxy = {'http': proxy, 'https': proxy} if proxy else None
        self.proxy = proxy
        self.is_running = True
        self.is_paused = False
        self.success_count = 0
        self.fail_count = 0
        self.total_size = 0  # total number of bytes downloaded so far
        self.start_time = None  # reserved for the download start time; never set here

    def run(self):
        """Fetch each result page, then download and save every new image link."""
        image_count = 0
        # Guard against a zero denominator when total_batches is 0.
        total_images = max(1, self.total_batches * self.IMAGES_PER_BATCH)
        seen_urls = set()
        headers = {'User-Agent': self.user_agent}

        for batch in range(self.total_batches):
            if not self._wait_while_paused():
                break
            for image_link in self._fetch_image_links(batch, headers):
                if not self._wait_while_paused():
                    break
                if image_link in seen_urls:
                    self.update_log.emit(f'跳过重复图片: {image_link}')
                    continue

                content = self._download_with_retries(image_link, headers)
                if content is None:
                    self._record_failure()
                    continue

                image_path = os.path.join(self.save_path,
                                          f'{self.category_name}{image_count + 1}.{self.image_format}')
                try:
                    with open(image_path, 'wb') as file:
                        file.write(content)
                except OSError as e:
                    # A disk error will not be cured by downloading again.
                    self.update_log.emit(f'下载失败: {image_link} 错误: {str(e)}')
                    self._record_failure()
                    continue

                image_count += 1
                self.success_count += 1
                self.total_size += len(content)
                seen_urls.add(image_link)
                self.update_log.emit(f'正在下载第{image_count}张图片: {image_path}')
                # A page may return more links than IMAGES_PER_BATCH; clamp.
                self.update_progress.emit(min(100, int((image_count / total_images) * 100)))
                self.update_counts.emit(self.success_count, self.fail_count)

        self.download_completed.emit()

    def _wait_while_paused(self):
        """Block while paused; return False once the download was stopped."""
        # Also watch is_running so that stop() takes effect during a pause.
        while self.is_paused and self.is_running:
            self.msleep(200)
        return self.is_running

    def _fetch_image_links(self, batch, headers):
        """Return the image URLs found on result page ``batch`` ([] on error)."""
        # Passing the keyword through ``params`` lets requests URL-encode it.
        params = {'tn': 'baiduimage', 'ie': 'utf-8', 'word': self.category_name,
                  'pn': batch * self.PAGE_STEP}
        try:
            response = requests.get(self.SEARCH_URL, params=params, headers=headers,
                                    timeout=self.timeout, proxies=self.proxy)
        except requests.RequestException as e:
            self.update_log.emit(f'无法访问URL: {self.SEARCH_URL} 错误: {str(e)}')
            return []
        if response.status_code != 200:
            self.update_log.emit(f'无法访问URL: {response.url}')
            return []
        # Tolerate stray invalid bytes instead of aborting the whole page.
        html_content = response.content.decode('utf-8', errors='replace')
        return self.OBJ_URL_PATTERN.findall(html_content)

    def _download_with_retries(self, image_link, headers):
        """Return the image bytes, or None after ``max_retries`` failed attempts."""
        for _ in range(self.max_retries):
            try:
                img_response = requests.get(image_link, headers=headers, timeout=self.timeout,
                                            proxies=self.proxy)
            except (requests.RequestException, OSError, ValueError) as e:
                # Links come from scraped HTML, so malformed URLs are expected.
                self.update_log.emit(f'下载失败: {image_link} 错误: {str(e)}')
                continue
            if img_response.status_code == 200:
                return img_response.content
            self.update_log.emit(f'下载失败: {image_link} 状态码: {img_response.status_code}')
        return None

    def _record_failure(self):
        """Count one failed image and publish the updated totals."""
        self.fail_count += 1
        self.update_counts.emit(self.success_count, self.fail_count)

    def stop(self):
        """Ask ``run`` to exit at its next check point."""
        self.is_running = False

    def pause(self):
        """Suspend downloading at the next check point."""
        self.is_paused = True

    def resume(self):
        """Continue after ``pause``."""
        self.is_paused = False
class ImageScraper(QWidget):
    """Main window: collects the download settings and drives a ScraperThread."""

    # File suffixes treated as downloaded images by preview and delete.
    IMAGE_EXTENSIONS = ('.jpg', '.png', '.gif')

    def __init__(self):
        super().__init__()
        # Current (or most recent) worker thread; None until the first download.
        self.scraper_thread = None
        self.initUI()

    # ------------------------------------------------------------------ UI
    def initUI(self):
        """Build the window: menu bar, settings form, buttons, progress and log."""
        self.setWindowTitle('PhotoSpideMan')
        self.setGeometry(300, 300, 800, 600)
        self.setWindowIcon(QIcon('app_icon.png'))

        main_layout = QVBoxLayout()
        self._build_menu(main_layout)
        self._build_form(main_layout)
        self._build_buttons(main_layout)
        self._build_output_area(main_layout)
        self.setLayout(main_layout)

    def _build_menu(self, main_layout):
        """Create the menu bar with the file and help menus."""
        self.menu_bar = QMenuBar(self)

        self.save_log_action = QAction('保存日志', self)
        self.save_log_action.triggered.connect(self.save_log)
        self.load_log_action = QAction('加载日志', self)
        self.load_log_action.triggered.connect(self.load_log)
        self.preview_action = QAction('图片预览', self)
        self.preview_action.triggered.connect(self.preview_images)
        self.about_action = QAction('关于作者', self)
        self.about_action.triggered.connect(self.show_about)

        file_menu = self.menu_bar.addMenu('文件')
        file_menu.addAction(self.save_log_action)
        file_menu.addAction(self.load_log_action)
        file_menu.addAction(self.preview_action)
        help_menu = self.menu_bar.addMenu('帮助')
        help_menu.addAction(self.about_action)

        main_layout.setMenuBar(self.menu_bar)

    def _add_line_edit(self, layout, label_text, default_text=''):
        """Add a label plus a 30px-high line edit to ``layout``; return both."""
        label = QLabel(label_text)
        line_edit = QLineEdit(self)
        line_edit.setFixedHeight(30)
        if default_text:
            line_edit.setText(default_text)
        layout.addWidget(label)
        layout.addWidget(line_edit)
        return label, line_edit

    def _add_combobox(self, layout, label_text, items):
        """Add a label plus a 30px-high combo box to ``layout``; return both."""
        label = QLabel(label_text)
        combobox = QComboBox(self)
        combobox.setFixedHeight(30)
        combobox.addItems(items)
        layout.addWidget(label)
        layout.addWidget(combobox)
        return label, combobox

    def _build_form(self, main_layout):
        """Create the settings form (category, count, format, network options, path)."""
        form_layout = QVBoxLayout()
        form_layout.setSpacing(10)
        form_layout.setContentsMargins(10, 10, 10, 10)

        self.category_label, self.category_input = self._add_line_edit(form_layout, '图片类别:')
        self.number_label, self.number_input = self._add_line_edit(
            form_layout, '图片数量(1=60张, 2=120张):')
        self.format_label, self.format_combobox = self._add_combobox(
            form_layout, '图片格式:', ['jpg', 'png', 'gif'])
        self.timeout_label, self.timeout_input = self._add_line_edit(form_layout, '超时设置(秒):', '10')
        self.retries_label, self.retries_input = self._add_line_edit(form_layout, '最大重试次数:', '3')
        self.user_agent_label, self.user_agent_combobox = self._add_combobox(form_layout, 'User-Agent:', [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
        ])
        self.concurrent_label, self.concurrent_input = self._add_line_edit(form_layout, '最大并发下载数:', '5')
        self.proxy_label, self.proxy_input = self._add_line_edit(form_layout, '代理(可选):')

        # Horizontal separator between the network options and the save path.
        separator = QFrame()
        separator.setFrameShape(QFrame.HLine)
        separator.setFrameShadow(QFrame.Sunken)
        form_layout.addWidget(separator)

        save_path_layout = QHBoxLayout()
        self.save_path_label = QLabel('保存路径:')
        self.save_path_label.setFont(QFont('Arial', 10))
        self.save_path_input = QLineEdit(self)
        self.save_path_input.setFixedHeight(30)
        self.save_path_input.setText(os.getcwd())
        self.browse_button = QPushButton('浏览', self)
        self.browse_button.setFixedHeight(30)
        self.browse_button.clicked.connect(self.browse_folder)
        save_path_layout.addWidget(self.save_path_label)
        save_path_layout.addWidget(self.save_path_input)
        save_path_layout.addWidget(self.browse_button)
        form_layout.addLayout(save_path_layout)

        main_layout.addLayout(form_layout)

    def _add_button(self, layout, text, slot):
        """Add a 40px-high push button wired to ``slot``; return the button."""
        button = QPushButton(text, self)
        button.setFixedHeight(40)
        button.clicked.connect(slot)
        layout.addWidget(button)
        return button

    def _build_buttons(self, main_layout):
        """Create the row of control buttons."""
        button_layout = QHBoxLayout()
        button_layout.setSpacing(10)
        self.start_button = self._add_button(button_layout, '开始下载', self.start_download)
        self.pause_button = self._add_button(button_layout, '暂停下载', self.pause_download)
        self.resume_button = self._add_button(button_layout, '继续下载', self.resume_download)
        self.cancel_button = self._add_button(button_layout, '取消下载', self.stop_download)
        self.clear_button = self._add_button(button_layout, '清空日志', self.clear_log)
        self.restart_button = self._add_button(button_layout, '再次爬取', self.restart_download)
        # Only enabled once a download has finished or been cancelled.
        self.restart_button.setEnabled(False)
        self.delete_button = self._add_button(button_layout, '删除所有图片', self.delete_all_images)
        main_layout.addLayout(button_layout)

    def _build_output_area(self, main_layout):
        """Create the progress bar, log view, counters and status bar."""
        self.progress_bar = QProgressBar(self)
        main_layout.addWidget(self.progress_bar)

        self.log_area = QTextEdit(self)
        self.log_area.setReadOnly(True)
        self.log_area.setFont(QFont('Courier', 10))
        main_layout.addWidget(self.log_area)

        self.counts_label = QLabel('成功: 0 失败: 0')
        main_layout.addWidget(self.counts_label)

        self.status_bar = QStatusBar(self)
        self.status_label = QLabel('状态: 准备就绪')
        self.status_bar.addWidget(self.status_label)
        main_layout.addWidget(self.status_bar)

    # ------------------------------------------------------------ helpers
    @staticmethod
    def _parse_positive_int(text):
        """Return ``text`` as an int greater than 0, or None if it is not one."""
        try:
            value = int(text.strip())
        except ValueError:
            return None
        return value if value > 0 else None

    def _list_images(self, folder):
        """Return the file names in ``folder`` that end in a known image suffix."""
        return [name for name in os.listdir(folder) if name.endswith(self.IMAGE_EXTENSIONS)]

    def _thread_is_active(self):
        """Return True while a worker thread exists and is still executing."""
        return self.scraper_thread is not None and self.scraper_thread.isRunning()

    # -------------------------------------------------------------- slots
    def browse_folder(self):
        """Let the user pick the save directory."""
        folder = QFileDialog.getExistingDirectory(self, "选择文件夹")
        if folder:
            self.save_path_input.setText(folder)

    def start_download(self):
        """Validate the form and start a new ScraperThread."""
        # Starting a second worker would orphan the first one.
        if self._thread_is_active():
            self.log_message('已有下载任务正在进行,请先取消或等待其结束')
            return

        self.restart_button.setEnabled(False)
        self.clear_log()

        category_name = self.category_input.text().strip()
        if not category_name:
            self.log_message('请输入图片类别')
            return

        batches = self._parse_positive_int(self.number_input.text())
        if batches not in (1, 2):
            self.log_message('请输入有效的图片数量(1 或 2)')
            return

        timeout = self._parse_positive_int(self.timeout_input.text())
        max_retries = self._parse_positive_int(self.retries_input.text())
        max_concurrent = self._parse_positive_int(self.concurrent_input.text())
        if timeout is None or max_retries is None or max_concurrent is None:
            self.log_message('超时、最大重试次数和最大并发下载数必须为正整数')
            return

        save_path = self.save_path_input.text().strip()
        if not save_path:
            self.log_message('请选择保存路径')
            return
        try:
            os.makedirs(save_path, exist_ok=True)
        except OSError as e:
            self.log_message(f'无法创建保存路径: {e}')
            return

        # requests expects a scheme -> URL mapping, not a bare string.
        proxy_text = self.proxy_input.text().strip()
        proxy = {'http': proxy_text, 'https': proxy_text} if proxy_text else None

        self.progress_bar.setValue(0)
        self.counts_label.setText('成功: 0 失败: 0')

        self.scraper_thread = ScraperThread(category_name, batches, save_path,
                                            self.format_combobox.currentText(), timeout, max_retries,
                                            self.user_agent_combobox.currentText(), max_concurrent, proxy)
        self.scraper_thread.update_log.connect(self.log_message)
        self.scraper_thread.update_progress.connect(self.progress_bar.setValue)
        self.scraper_thread.update_counts.connect(self.update_counts)
        self.scraper_thread.download_completed.connect(self.download_completed)
        self.status_label.setText('状态: 正在下载')
        self.scraper_thread.start()

    def pause_download(self):
        """Pause the current worker, if any."""
        if self.scraper_thread is not None:
            self.scraper_thread.pause()
            self.status_label.setText('状态: 已暂停')

    def resume_download(self):
        """Resume the current worker, if any."""
        if self.scraper_thread is not None:
            self.scraper_thread.resume()
            self.status_label.setText('状态: 正在下载')

    def stop_download(self):
        """Cancel the current worker, if any, and reset the progress display."""
        if self.scraper_thread is not None:
            self.scraper_thread.stop()
            # Clear the pause flag too so a paused worker can notice the stop.
            self.scraper_thread.resume()
            self.progress_bar.setValue(0)
            self.counts_label.setText('成功: 0 失败: 0')
            self.status_label.setText('状态: 已取消')
            self.restart_button.setEnabled(True)

    def clear_log(self):
        """Empty the log view."""
        self.log_area.clear()

    def save_log(self):
        """Ask for a file name and write the log text to it."""
        log_text = self.log_area.toPlainText()
        save_path, _ = QFileDialog.getSaveFileName(self, "保存日志", "", "文本文件 (*.txt);;所有文件 (*)")
        if not save_path:
            return
        try:
            with open(save_path, 'w', encoding='utf-8') as file:
                file.write(log_text)
        except OSError as e:
            self.log_message(f'保存日志失败: {e}')

    def load_log(self):
        """Ask for a log file and show its content in the log view."""
        load_path, _ = QFileDialog.getOpenFileName(self, "加载日志", "", "文本文件 (*.txt);;所有文件 (*)")
        if not load_path:
            return
        try:
            with open(load_path, 'r', encoding='utf-8') as file:
                log_text = file.read()
        except (OSError, UnicodeDecodeError) as e:
            self.log_message(f'加载日志失败: {e}')
            return
        self.log_area.setText(log_text)

    def preview_images(self):
        """Open a preview dialog for the images in the save directory."""
        save_path = self.save_path_input.text()
        if not os.path.isdir(save_path):
            self.log_message('保存路径不存在')
            return
        images = self._list_images(save_path)
        if not images:
            self.log_message('没有找到图片进行预览')
            return
        preview_dialog = PreviewDialog(images, save_path)
        preview_dialog.exec_()

    def update_counts(self, success, fail):
        """Show the running success/failure totals."""
        self.counts_label.setText(f'成功: {success} 失败: {fail}')

    def download_completed(self):
        """React to the worker finishing: notify the user and auto-save the log."""
        self.restart_button.setEnabled(True)
        thread = self.scraper_thread
        # The worker also signals completion after a cancel; in that case the
        # "all images downloaded" notice would be wrong, so keep the cancelled state.
        if thread is not None and not thread.is_running:
            return
        QMessageBox.information(self, '完成', '所有图片下载完成!')
        self.status_label.setText('状态: 下载完成')
        self.auto_save_log()

    def restart_download(self):
        """Offer to save the current log, then start a new download."""
        self.save_log()
        self.start_download()

    def log_message(self, message):
        """Append one line to the log view."""
        self.log_area.append(message)

    def show_about(self):
        """Show the author information dialog."""
        QMessageBox.information(self, '关于作者', '哈哈哈\nEmail: 3217860797@qq.com')

    def delete_all_images(self):
        """After confirmation, delete every image file in the save directory."""
        save_path = self.save_path_input.text()
        if not os.path.isdir(save_path):
            self.log_message('保存路径不存在')
            return
        images = self._list_images(save_path)
        if not images:
            self.log_message('没有找到图片进行删除')
            return
        reply = QMessageBox.question(self, '删除所有图片', '确定要删除所有下载的图片吗?',
                                     QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
        if reply != QMessageBox.Yes:
            return
        failed = 0
        for image in images:
            try:
                os.remove(os.path.join(save_path, image))
            except OSError as e:
                failed += 1
                self.log_message(f'删除失败: {image} 错误: {e}')
        if failed:
            self.log_message(f'已删除 {len(images) - failed} 张图片,{failed} 张删除失败')
        else:
            self.log_message('所有图片已删除')

    def auto_save_log(self):
        """Write the log to ``download_log.txt`` inside the save directory."""
        log_text = self.log_area.toPlainText()
        save_path = os.path.join(self.save_path_input.text(), 'download_log.txt')
        try:
            with open(save_path, 'w', encoding='utf-8') as file:
                file.write(log_text)
        except OSError as e:
            self.log_message(f'日志自动保存失败: {e}')
            return
        self.log_message(f'日志已自动保存到 {save_path}')

    def closeEvent(self, event):
        """Stop a running worker before the window goes away."""
        if self._thread_is_active():
            self.scraper_thread.stop()
            self.scraper_thread.resume()
            # Bounded wait: an in-flight request may take up to its timeout.
            self.scraper_thread.wait(3000)
        super().closeEvent(event)
class PreviewDialog(QDialog):
    """Modal dialog showing a scrollable column of image thumbnails."""

    def __init__(self, images, save_path, parent=None):
        """Build the thumbnail list.

        Args:
            images: file names (relative to ``save_path``) to display.
            save_path: directory containing the image files.
            parent: optional parent widget.
        """
        super().__init__(parent)
        self.setWindowTitle('图片预览')
        self.setGeometry(300, 300, 600, 400)

        # Thumbnails live in their own container so it can be scrolled;
        # adding them directly to the dialog makes it grow past the screen.
        container = QWidget()
        thumbnails = QVBoxLayout(container)
        for image in images:
            pixmap = QPixmap(os.path.join(save_path, image))
            label = QLabel(container)
            if pixmap.isNull():
                # The file could not be decoded as an image; say so instead
                # of leaving a blank gap.
                label.setText(f'无法加载: {image}')
            else:
                label.setPixmap(pixmap.scaled(200, 200, Qt.KeepAspectRatio))
            thumbnails.addWidget(label)

        scroll_area = QScrollArea(self)
        scroll_area.setWidgetResizable(True)
        scroll_area.setWidget(container)

        layout = QVBoxLayout()
        layout.addWidget(scroll_area)
        self.setLayout(layout)
if __name__ == '__main__':
    # Create the Qt application, show the main window, then run the event
    # loop and exit the process with the loop's return code.
    application = QApplication(sys.argv)
    main_window = ImageScraper()
    main_window.show()
    exit_code = application.exec_()
    sys.exit(exit_code)
爬虫代码借鉴:如何快速制作一个垃圾分类图像识别器(卷积神经网络)_tf-estimator-nightly==2.8.0.dev2021122109-CSDN博客
大水文解释:图片爬虫小脚本(他说写了送1500推广)-CSDN博客
exe下载:
链接:https://pan.baidu.com/s/1_B8F4LdvKYHZMNkrTPJJPA
提取码:6666