前言
最近在学习pyqt5,也为了复习之前学过的爬虫知识,于是将两者结合起来做一个图片爬取的小应用。
一、pyqt5和requests
pyqt5是用于创建图形界面的包,可以非常方便的创建一个客户端界面,直接用pip下载即可:
pip install pyqt5
推荐使用qtdesigner设计界面,具体用法网上有,这里不再赘述。注意qtdesigner并不随pyqt5一起安装,需要另外执行 pip install pyqt5-tools 下载(见文末“修正”),安装后的具体路径如下:
双击图中的designer.exe即可运行:
注意:
可能会运行不了,这里分享下自己使用的qtdesigner,且已经汉化,解压即可:
点击下载
提取码:vww3
界面如下:
界面代码如下:
# -*- coding: utf-8 -*-
# Form implementation generated from reading ui file 'untitled.ui'
#
# Created by: PyQt5 UI code generator 5.15.0
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again. Do not edit this file unless you know what you are doing.
from PyQt5 import QtCore, QtWidgets
from PyQt5.QtWidgets import QMessageBox, QWidget
import sys
from 代码 import baidu_spider
from 代码 import sougou_spider
import os
class Ui_Dialog(QWidget):
    """Main window of the image crawler.

    The user picks a search engine, enters how many images to fetch and a
    keyword, and the matching spider module is run; progress messages are
    appended to the text box at the bottom of the window.

    The layout code was generated by pyuic5 from 'untitled.ui' and then
    extended by hand with the crawl / clear handlers.
    """

    def __init__(self, parent=None):
        super(Ui_Dialog, self).__init__(parent)
        # Entries of the search-engine combo box; the empty entry means
        # "nothing selected yet".
        self.choice_list = ['', '百度', '搜狗']
        # setupUi() already calls retranslateUi(), so it is not repeated here.
        self.setupUi(self)

    def setupUi(self, Dialog):
        """Create all child widgets and layouts on Dialog and wire the buttons."""
        Dialog.setObjectName("Dialog")
        Dialog.resize(484, 389)
        self.verticalLayout = QtWidgets.QVBoxLayout(Dialog)
        self.verticalLayout.setObjectName("verticalLayout")

        # Row 1: search-engine selection.
        self.horizontalLayout = QtWidgets.QHBoxLayout()
        self.horizontalLayout.setObjectName("horizontalLayout")
        self.label = QtWidgets.QLabel(Dialog)
        self.label.setObjectName("label")
        self.horizontalLayout.addWidget(self.label)
        self.comboBox = QtWidgets.QComboBox(Dialog)
        sizePolicy = QtWidgets.QSizePolicy(QtWidgets.QSizePolicy.MinimumExpanding, QtWidgets.QSizePolicy.Fixed)
        sizePolicy.setHorizontalStretch(0)
        sizePolicy.setVerticalStretch(0)
        sizePolicy.setHeightForWidth(self.comboBox.sizePolicy().hasHeightForWidth())
        self.comboBox.setSizePolicy(sizePolicy)
        self.comboBox.setObjectName("comboBox")
        self.comboBox.addItems(self.choice_list)
        self.horizontalLayout.addWidget(self.comboBox)
        self.verticalLayout.addLayout(self.horizontalLayout)

        # Row 2: image count and image keyword.
        self.horizontalLayout_2 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_2.setObjectName("horizontalLayout_2")
        self.label_2 = QtWidgets.QLabel(Dialog)
        self.label_2.setObjectName("label_2")
        self.horizontalLayout_2.addWidget(self.label_2)
        self.lineEdit = QtWidgets.QLineEdit(Dialog)
        self.lineEdit.setObjectName("lineEdit")
        self.horizontalLayout_2.addWidget(self.lineEdit)
        self.label_4 = QtWidgets.QLabel(Dialog)
        self.label_4.setObjectName("label_4")
        self.horizontalLayout_2.addWidget(self.label_4)
        self.lineEdit_type = QtWidgets.QLineEdit(Dialog)
        # The name belongs to lineEdit_type; the original assigned it to
        # self.lineEdit, overwriting that widget's own object name.
        self.lineEdit_type.setObjectName("lineEdit_type")
        self.horizontalLayout_2.addWidget(self.lineEdit_type)
        self.verticalLayout.addLayout(self.horizontalLayout_2)

        # Row 3: action buttons.
        self.horizontalLayout_3 = QtWidgets.QHBoxLayout()
        self.horizontalLayout_3.setObjectName("horizontalLayout_3")
        self.pushButton = QtWidgets.QPushButton(Dialog)
        self.pushButton.setObjectName("pushButton")
        self.horizontalLayout_3.addWidget(self.pushButton)
        self.pushButton_clear = QtWidgets.QPushButton(Dialog)
        self.pushButton_clear.setObjectName("pushButton_clear")
        self.horizontalLayout_3.addWidget(self.pushButton_clear)
        self.verticalLayout.addLayout(self.horizontalLayout_3)

        # Result area: a separator label and the log text box.
        self.label_3 = QtWidgets.QLabel(Dialog)
        self.label_3.setObjectName("label_3")
        self.verticalLayout.addWidget(self.label_3)
        self.textEdit = QtWidgets.QTextEdit(Dialog)
        self.textEdit.setObjectName("textEdit")
        self.verticalLayout.addWidget(self.textEdit)

        self.retranslateUi(Dialog)
        QtCore.QMetaObject.connectSlotsByName(Dialog)

        # Button click handlers.
        self.pushButton.clicked.connect(self.start_spider)
        self.pushButton_clear.clicked.connect(self.clear)

    def retranslateUi(self, Dialog):
        """Set the window title and every user-visible label / button text."""
        _translate = QtCore.QCoreApplication.translate
        Dialog.setWindowTitle(_translate("Dialog", "图片爬取器"))
        self.label.setText(_translate("Dialog", "请选择爬取图片的网站"))
        self.label_2.setText(_translate("Dialog", "请输入爬取图片的数量"))
        self.label_4.setText(_translate("Dialog", "请输入爬取图片的类型"))
        self.pushButton.setText(_translate("Dialog", "开始爬取"))
        self.pushButton_clear.setText(_translate("Dialog", "清空所有"))
        self.label_3.setText(_translate("Dialog", "----------------------------------爬取结果-----------------------------------"))

    def start_spider(self):
        """Validate the form, run the selected spider and log its progress.

        Each validation failure shows a warning box and returns without
        crawling. Errors raised while crawling are written to the text box
        instead of escaping the slot.
        """
        engine = self.comboBox.currentText()
        if engine == '':
            QMessageBox.warning(self, '警告', '请选择要爬取的网站!', QMessageBox.Yes)
            return
        if self.lineEdit.text() == '':
            QMessageBox.warning(self, '警告', '请输入要爬取图片的数量!', QMessageBox.Yes)
            return
        if self.lineEdit_type.text() == '':
            QMessageBox.warning(self, '警告', '请输入要爬取图片的类型!', QMessageBox.Yes)
            return

        image_count, is_int = self.valid_int(self.lineEdit.text())
        if not is_int:
            QMessageBox.warning(self, '警告', '图片的数量请输入整数!', QMessageBox.Yes)
            return

        # Map the combo-box text to the spider module that handles it.
        spiders = {'百度': baidu_spider, '搜狗': sougou_spider}
        spider = spiders.get(engine)
        if spider is None:
            # Not reachable with the current choice_list; guards against a
            # new combo entry being added without a matching spider.
            return

        image_name = self.lineEdit_type.text()
        self.textEdit.insertPlainText('--------图片开始爬取-----------\n')
        try:
            # main() returns (folder already existed?, iterable of messages).
            exist, results = spider.main(image_name, image_count)
            if exist:
                self.textEdit.insertPlainText('该类型图片已存在!继续追加!\n')
            for result in results:
                self.textEdit.insertPlainText(result + '\n')
        except Exception as error:
            # An exception escaping a slot aborts a PyQt5 application, so
            # report the failure in the log area instead.
            self.textEdit.insertPlainText('图片爬取失败:{}\n'.format(error))
            return
        self.textEdit.insertPlainText('图片爬取完成!---------------\n')

    def clear(self):
        """Reset the log area, both input fields and the engine selection."""
        self.textEdit.setText('')
        self.lineEdit_type.setText('')
        self.lineEdit.setText('')
        self.comboBox.setCurrentIndex(0)

    def valid_int(self, text):
        """Try to parse text as an integer.

        Returns:
            (value, True) when text is a valid integer, otherwise (0, False).
        """
        try:
            return int(text), True
        except (TypeError, ValueError):
            return 0, False
if __name__ == '__main__':
    # Script entry point: create the Qt application, show the crawler
    # window, then exit the process with the event loop's return code.
    application = QtWidgets.QApplication(sys.argv)
    window = Ui_Dialog()
    window.show()
    exit_code = application.exec()
    sys.exit(exit_code)
requests库用来爬取网页,我这里写了爬取百度和搜狗图片的爬虫,百度图片爬虫网上很多,这里就不详细展开;搜狗图片爬虫和百度图片爬虫大同小异,能理解百度爬虫,写搜狗爬虫也不难。代码如下:
baidu_spider.py
import re
import requests
import os
def baidu_spider(url):
    """Fetch url with browser-like headers and return the response body.

    Args:
        url: the Baidu image-search URL to request.

    Returns:
        The response text when the server answers HTTP 200, otherwise None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        # Placeholder value: replace it with the cookie from your own browser.
        'Cookie': '自己浏览器的cookie'
    }
    # Without a timeout a stalled connection would block the caller forever.
    result = requests.get(url, headers=headers, timeout=10)
    if result.status_code == 200:
        return result.text
    return None
# Parse the response page
def parse_html(html):
    """Extract every "middleURL" value (image URL) from the response text.

    Args:
        html: response body as a string; None or '' is accepted and means
            the request produced no usable page.

    Returns:
        A list of image URL strings, empty when html is empty or contains
        no "middleURL" entries.
    """
    if not html:
        # The fetch step returns None on a non-200 response; treat that as
        # "no images" instead of letting re raise TypeError.
        return []
    pattern = re.compile('.*?"middleURL":"(.*?)"', re.S)
    return pattern.findall(html)
# Download images
def download_img(items, img_name, save_dir=None):
    """Download every image URL in items, yielding one status message per URL.

    This is a generator: nothing is downloaded until the caller iterates it.
    Files are written as "<img_name>-<n>.jpg", where n continues after the
    number of entries already present in the target directory.

    Args:
        items: list of image URLs.
        img_name: keyword used as the file-name prefix and in the messages.
        save_dir: directory to write into. Defaults to the current working
            directory at the time iteration starts.

    Yields:
        A status string per URL; failed downloads end with '下载失败'.
    """
    if save_dir is None:
        save_dir = os.getcwd()
    start_number = len(os.listdir(save_dir)) + 1
    for i, img_url in enumerate(items):
        try:
            image = requests.get(img_url, timeout=10)
            # Treat HTTP errors as failures instead of saving an error page
            # under a .jpg name.
            image.raise_for_status()
            result = '图片类型:' + img_name + ';正在下载第{}张图片。。。。。。。'.format(str(i+1))
            file_path = os.path.join(save_dir, img_name + '-' + str(i+start_number) + ".jpg")
            with open(file_path, 'wb') as f:
                f.write(image.content)
        except Exception:
            # Best effort: report the failure and move on to the next URL.
            result = '图片名:' + img_name + ';正在下载第{}张图片。。。。。。。'.format(str(i+1)) + '下载失败'
            # Keep the file numbering contiguous after a failed download.
            start_number -= 1
        yield result


def main(img_name, img_count):
    """Search Baidu images for img_name and prepare the download.

    Images go to '图片/百度/<img_name>' relative to the working directory.
    The directory is created when missing and the working directory is left
    untouched, so the function can be called repeatedly in one process.

    Args:
        img_name: search keyword; also the folder name and file-name prefix.
        img_count: number of images to request.

    Returns:
        (exist, results): exist is 1 when the keyword folder already existed
        (new images are appended), else 0; results is the download_img
        generator producing status messages.
    """
    from urllib.parse import quote

    # Resolve to an absolute path now, because the download runs lazily and
    # the working directory may have changed by then.
    save_dir = os.path.abspath(os.path.join('图片', '百度', img_name))
    exist = 1 if os.path.isdir(save_dir) else 0
    os.makedirs(save_dir, exist_ok=True)
    # Skip as many search results as there are files already in the folder.
    start_number = len(os.listdir(save_dir))
    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # break the query string.
    url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&is=&word={}&rn={}&pn={}'.format(
        quote(img_name, safe=''), img_count, start_number)
    html = baidu_spider(url)
    # baidu_spider returns None on a non-200 response.
    items = parse_html(html) if html else []
    results = download_img(items, img_name, save_dir)
    return exist, results
sougou_spider.py
import re
import requests
import os
def sougou_spider(url):
    """Fetch url with browser-like headers and return the response body.

    Args:
        url: the Sogou image-search URL to request.

    Returns:
        The response text when the server answers HTTP 200, otherwise None.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        # Placeholder value: replace it with the cookie from your own browser.
        'Cookie': '自己浏览器的cookie'
    }
    # Without a timeout a stalled connection would block the caller forever.
    result = requests.get(url, headers=headers, timeout=10)
    if result.status_code == 200:
        return result.text
    return None
# Parse the response page
def parse_html(html):
    """Extract the "picUrl" values from the first "items" array in html.

    Args:
        html: response body as a string; None or '' is accepted and means
            the request produced no usable page.

    Returns:
        A list of image URL strings, empty when html is empty or has no
        "items" array.
    """
    if not html:
        # The fetch step returns None on a non-200 response.
        return []
    # Raw strings: '\[' in a normal string literal is an invalid escape.
    pattern = re.compile(r'.*?"items":\[(.*?)\],*?', re.S)
    item = pattern.findall(html)
    if not item:
        # No "items" array in the response; indexing item[0] would raise.
        return []
    pattern_picurl = re.compile(r'.*?"picUrl":"(.*?)",', re.S)
    return pattern_picurl.findall(item[0])
# Download images
def download_img(items, img_name, save_dir=None):
    """Download every image URL in items, yielding one status message per URL.

    This is a generator: nothing is downloaded until the caller iterates it.
    Files are written as "<img_name>-<n>.jpg", where n continues after the
    number of entries already present in the target directory.

    Args:
        items: list of image URLs.
        img_name: keyword used as the file-name prefix and in the messages.
        save_dir: directory to write into. Defaults to the current working
            directory at the time iteration starts.

    Yields:
        A status string per URL; failed downloads end with '下载失败'.
    """
    if save_dir is None:
        save_dir = os.getcwd()
    start_number = len(os.listdir(save_dir)) + 1
    for i, img_url in enumerate(items):
        try:
            image = requests.get(img_url, timeout=10)
            # Treat HTTP errors as failures instead of saving an error page
            # under a .jpg name.
            image.raise_for_status()
            result = '图片名:' + img_name + ';正在下载第{}张图片。。。。。。。'.format(str(i+1))
            file_path = os.path.join(save_dir, img_name + '-' + str(i+start_number) + ".jpg")
            with open(file_path, 'wb') as f:
                f.write(image.content)
        except Exception:
            # Best effort: report the failure and move on to the next URL.
            result = '图片名:' + img_name + ';正在下载第{}张图片。。。。。。。'.format(str(i+1)) + '下载失败'
            # Keep the file numbering contiguous after a failed download.
            start_number -= 1
        yield result


def main(img_name, img_count):
    """Search Sogou images for img_name and prepare the download.

    Images go to '图片/搜狗/<img_name>' relative to the working directory.
    The directory is created when missing and the working directory is left
    untouched, so the function can be called repeatedly in one process.

    Args:
        img_name: search keyword; also the folder name and file-name prefix.
        img_count: number of images to request.

    Returns:
        (exist, results): exist is 1 when the keyword folder already existed
        (new images are appended), else 0; results is the download_img
        generator producing status messages.
    """
    from urllib.parse import quote

    # Resolve to an absolute path now, because the download runs lazily and
    # the working directory may have changed by then.
    save_dir = os.path.abspath(os.path.join('图片', '搜狗', img_name))
    exist = 1 if os.path.isdir(save_dir) else 0
    os.makedirs(save_dir, exist_ok=True)
    # Start after the files already in the folder to avoid re-fetching them.
    start_number = len(os.listdir(save_dir))
    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # break the query string.
    url = 'https://pic.sogou.com/napi/pc/searchList?query={}&start={}&xml_len={}'.format(
        quote(img_name, safe=''), start_number, img_count)
    # Fetch the data; sougou_spider returns None on a non-200 response.
    html = sougou_spider(url)
    # Parse the data.
    items = parse_html(html) if html else []
    # Download the images (lazily, as the caller iterates the generator).
    results = download_img(items, img_name, save_dir)
    return exist, results
整个项目文件目录如下:
万事俱备,可以运行了!
二、运行效果
总结
这个小应用只是一时兴起才写的,会有很多不足的地方,感兴趣的小伙伴可以看下,也欢迎各位大佬前来指正。
人生苦短,我用Python
修正
1、之前犯了个错误,qtdesigner需要另外下载,并不是自带的,下载命令如下:
pip install pyqt5-tools
2、之前的代码有问题,接口路径改了,所以都重新改了;搜狗搜索无论怎么调参数每次都会拿46条数据,欢迎有解决方法的小伙伴在评论区留言。
不好意思,给大家带来不便,希望大家能纠正我的错误。