【PYTHON 随笔1】爬取百度图片

最新推荐文章于 2023-02-14 11:38:52 发布

**@Sun King**

最新推荐文章于 2023-02-14 11:38:52 发布

阅读量620

点赞数

分类专栏： Python笔记文章标签： python 爬虫

本文链接：https://blog.csdn.net/qq_44637632/article/details/122213480

版权

Python笔记专栏收录该内容

3 篇文章 0 订阅

订阅专栏

利用 Python 和 PyQt5 爬取百度图片

【一】准备

知识准备

前端基础（HTML、CSS 等）
python基础
PyQt 基础
下面的模块相关知识

模块准备

PyQt5 >> UI框架
re >> 正则表达式处理字符串
urllib >> python内置的http请求库
bs4 >> 处理html对象及其tag对象
requests >> 和urllib类似，为http请求库，但使用相对更为方便

构架准备

【二】代码实现

主目录下

__init__.py

import Control
import Service
import Client

Main.py

import Client

# Launch the client application only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    Client.client_App.main_Application()

Client目录(与用户的交互)

__init__.py

from Client import client_App

client_App.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-

import sys
# 这里我们提供必要的引用。基本控件位于pyqt5.qtwidgets模块中。样式调整在PyQt5.QtGui
from PyQt5.QtWidgets import QApplication, QWidget, QToolTip, QPushButton, QLineEdit, QMessageBox, QDesktopWidget, \
    QLabel, QFileDialog
from PyQt5.QtGui import QIcon, QFont

from Service import imageCollector as imgc


class Spider_Application(QWidget):
    """Main window of the Baidu image spider.

    The window holds three text inputs (search keyword, number of images,
    save directory), a "Go" button that passes them to ``imgc.run_spider``
    and a "+" button that opens a directory picker for the save path.
    """

    def __init__(self):
        super().__init__()
        # Push buttons: "Go" submits the form, "+" selects the save directory.
        self.btn = QPushButton("Go", self)
        self.save_btn = QPushButton("+", self)
        # Single-line text inputs.
        self.search_item = QLineEdit(self)
        self.pages = QLineEdit(self)
        self.save_path = QLineEdit(self)
        # Captions displayed to the left of the inputs.
        self.search_label = QLabel("Search Item: ", self)
        self.pages_label = QLabel("Page Number: ", self)
        self.path_label = QLabel("Save Path: ", self)

        self.init_UI()

    def init_UI(self):
        """Set geometry, style, tooltips and signal connections, then show the window."""
        # Static call: sets the font used for all tooltips (SansSerif, size 10).
        QToolTip.setFont(QFont('SansSerif', 10))

        # --- Window ---
        self.resize(600, 600)
        self.setWindowTitle("BaiDu Image Spider")
        self.setWindowIcon(QIcon("resources/net.png"))
        self.center()
        # Style sheet for the line edits and for message boxes opened from this window.
        self.setStyleSheet("QLineEdit{\n"
                           "    border:0px;    \n"
                           "    border-bottom: 2px solid #B3B3B3;\n"
                           "    font-family:\'Microsoft YaHei\';\n"
                           "    font-weight:bold;\n"
                           "    }\n"
                           "\n"
                           "QLineEdit:hover{\n"
                           "    border-bottom: 3px solid #66A3FF;\n"
                           "    }\n"
                           "\n"
                           "QLineEdit:focus{\n"
                           "    border-bottom: 3px solid #E680BD;\n"
                           "    }\n"
                           "\n"
                           "QMessageBox{\n"
                           "    background-image: url(resources/net.png);\n"
                           "    }\n"
                           "\n"
                           "")

        # --- Buttons ---
        self.btn.setToolTip("To <b>submit</b> information.")
        self.save_btn.setToolTip("To <b>select Save Directory</b>.")
        # "Go" uses its recommended size; "+" is a fixed small button.
        self.btn.resize(self.btn.sizeHint())
        self.save_btn.resize(15, 25)
        self.btn.move(250, 450)
        self.save_btn.move(435, 250)

        # --- Line edits ---
        self.search_item.resize(300, 25)
        self.pages.resize(300, 25)
        # Narrower than the others to leave room for the "+" button.
        self.save_path.resize(285, 25)
        self.search_item.move(150, 150)
        self.pages.move(150, 200)
        self.save_path.move(150, 250)
        self.search_item.setToolTip("Enter <b> what you want to spider... </b>")
        self.pages.setToolTip("<b>How many images</b> you want to get?")
        self.save_path.setToolTip("Default save path is <b>Desktop</b>.")
        # The path cannot be typed in; it is only filled in by the directory picker.
        self.save_path.setEnabled(False)

        # --- Labels ---
        self.search_label.resize(100, 25)
        self.pages_label.resize(100, 25)
        self.path_label.resize(100, 25)
        self.search_label.move(45, 150)
        self.pages_label.move(45, 200)
        self.path_label.move(45, 250)

        # --- Signal connections ---
        self.btn.clicked.connect(lambda: self.btn_clickAction())
        self.save_btn.clicked.connect(lambda: self.file_btn_ClickAction())

        self.show()

    def btn_clickAction(self):
        """Read the form and run the spider; show the error dialog on any failure."""
        item = self.search_item.text()
        num = self.pages.text()
        path = self.save_path.text()

        # Log the raw input first so it is visible while the (blocking)
        # spider call below is still running.
        print("USR INPUT:", item, num, path)

        try:
            if not item.strip():
                raise ValueError("search item must not be empty")
            wanted = int(num)
            # A count <= 0 can never match the spider's stop condition.
            if wanted <= 0:
                raise ValueError("image count must be a positive integer")
            imgc.run_spider(item, wanted, path)
        except Exception as err:
            # Top-level UI boundary: report the cause instead of dropping it
            # silently, then notify the user.
            print("SPIDER ERROR:", repr(err))
            self.exception_process()

    def file_btn_ClickAction(self):
        """Open a directory picker and copy the chosen directory into the path field."""
        path = QFileDialog.getExistingDirectory(self, "Select Saving Path")
        self.save_path.setText(path)

    def exception_process(self):
        """Show a modal Yes/No error dialog while the main window is disabled."""
        self.setEnabled(False)
        box = QMessageBox(self)
        try:
            # Either answer only dismisses the dialog, so the reply is not inspected.
            box.question(box, 'Error', "Check your input information invalid?",
                         QMessageBox.Yes | QMessageBox.No,
                         QMessageBox.Yes)
        finally:
            box.close()
            # The helper box is a child of this window; schedule its deletion
            # so repeated errors do not accumulate hidden child widgets.
            box.deleteLater()
            # Always re-enable the window, even if the dialog call raised.
            self.setEnabled(True)

    def closeEvent(self, event):
        """Ask for confirmation before closing (overrides ``QWidget.closeEvent``)."""
        reply = QMessageBox.question(self, 'Exit Spider', "Are you sure to quit?",
                                     QMessageBox.Yes | QMessageBox.No,
                                     QMessageBox.No)
        if reply == QMessageBox.Yes:
            event.accept()
        else:
            event.ignore()

    def center(self):
        """Move the window so that its frame is centred on the available screen area."""
        # Rectangle of the window including its frame.
        qr = self.frameGeometry()
        # Centre point of the available desktop geometry.
        cp = QDesktopWidget().availableGeometry().center()
        # Centre the rectangle on that point and move the window to its top-left corner.
        qr.moveCenter(cp)
        self.move(qr.topLeft())


def main_Application():
    """Create the Qt application, build the spider window and run the event loop.

    The interpreter exits with the return code of the event loop.
    """
    # Every PyQt5 program must create an application object; sys.argv hands
    # the command-line arguments over to Qt.
    qt_app = QApplication(sys.argv)
    # The window shows itself during construction. The reference is kept in a
    # local so the object is not garbage-collected while the loop runs.
    main_window = Spider_Application()
    # exec_() carries a trailing underscore because ``exec`` was a Python
    # keyword; passing its result to sys.exit() gives a clean shutdown.
    exit_code = qt_app.exec_()
    sys.exit(exit_code)


if __name__ == '__main__':
    main_Application()

Control目录（控件功能，工具函数）

__init__.py

from Control import file_op_Tools
from Control import url_Process

file_op_Tools.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import os

def get_desk_p():
    """Return the path of the ``Desktop`` folder inside the current user's home directory."""
    home_dir = os.path.expanduser('~')
    return os.path.join(home_dir, "Desktop")


def mkdir(path):
    """Create the directory *path* (including missing parents) if it does not exist.

    Surrounding whitespace is stripped from *path* first.

    Returns:
        True if the directory was created, False if the path already existed.
    """
    target = path.strip()

    # Guard clause: nothing to do when the path is already present.
    if os.path.exists(target):
        print('文件夹已经创建')
        return False

    print('创建文件夹' + target)
    os.makedirs(target)
    return True

url_Process.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests

def getPage(url, timeout=15):
    """Download *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would block the caller forever.

    Returns:
        The page source as ``str``.

    Raises:
        requests.RequestException: On connection errors or when the timeout expires.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Browser-like User-Agent; presumably needed so the site serves the
    # regular page to the script -- verify against the target site.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/79.0.3945.130 Safari/537.36'}
    response = requests.get(url, headers=headers, timeout=timeout)
    return response.content.decode('utf-8')


def getNewPage(key):
    """Build the Baidu image search URL for the keyword *key*.

    The keyword is percent-encoded before it is appended as the ``word``
    query parameter, so spaces, ``&``, ``#`` and non-ASCII characters cannot
    corrupt the query string. Plain ASCII letters and digits are unchanged.

    Args:
        key: Search keyword as entered by the user.

    Returns:
        The complete search URL as ``str``.
    """
    # Local import: this module does not import urllib.parse at file level.
    from urllib.parse import quote

    crude_url = "https://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1627513568150_R&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1627513568151%5E00_1519X739&sid=&word="
    # safe='' also encodes '/', which has no business inside a query value.
    return crude_url + quote(key, safe='')

Service目录（核心爬虫代码）

__init__.py

from Service import imageCollector

imageCollector.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import re
import time
import urllib

from bs4 import BeautifulSoup

from Control import file_op_Tools as ftool
from Control import url_Process as prc


def savaImg(picurl, saveurl):
    """Download the resource at *picurl* and write it to the file *saveurl*.

    The data is read completely before the target file is opened, so a
    failed download does not leave an empty file behind.

    Args:
        picurl: URL of the image to download.
        saveurl: Path of the file to write (opened in binary mode).

    Returns:
        True on success; False if downloading or writing failed. The error is
        printed together with the URL.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    try:
        # Context managers close the connection and the file even on errors.
        with urllib.request.urlopen(picurl, timeout=15) as response:
            data = response.read()
        with open(saveurl, 'wb') as out_file:
            out_file.write(data)
        return True
    except Exception as e:
        # Best effort: one failed image must not abort the whole crawl.
        print(picurl, ": ", str(e))
        return False


def run_spider(key, num, path):
    """Download up to *num* thumbnail images for the keyword *key*.

    Images are saved as ``<key><index>.jpg``. If *path* is empty they go to
    ``<Desktop>/SpiderImage/<key>``, otherwise to ``<path>/BaiDuSpider_<key>``.
    Result pages are followed through their "next page" link until enough
    images have been saved or no further page is available.

    Args:
        key: Search keyword.
        num: Number of images to save; must be positive.
        path: Base directory chosen by the user, or an empty string for the
            default location.

    Raises:
        ValueError: If *num* is not positive.
    """
    # Local import: this module does not import os at file level.
    import os

    if num <= 0:
        # A non-positive target could never be reached by the counter below.
        raise ValueError("num must be a positive integer")

    # time.clock() was removed in Python 3.8; perf_counter() is the replacement.
    start = time.perf_counter()

    newhtml = prc.getNewPage(key)
    page_num = num
    # Build the target directory with os.path.join rather than hard-coded
    # backslashes so the path is valid on every platform.
    if not path:
        diru = os.path.join(ftool.get_desk_p(), "SpiderImage", key)
    else:
        diru = os.path.join(path, "BaiDuSpider_" + key)
    ftool.mkdir(diru)

    print(newhtml)
    count = 0
    while count < page_num:
        # Fetch the page and collect every "thumbURL" value found in its source.
        html_text = prc.getPage(newhtml)
        soup = BeautifulSoup(html_text, 'lxml')
        pictureimg = re.findall('"thumbURL":"(.*?)"', html_text)
        print("正在爬取页面：", pictureimg)

        # Save the images of this page; only successful downloads are counted.
        for img in pictureimg:
            filename = key + str(count) + '.jpg'
            if savaImg(img, os.path.join(diru, filename)):
                print('保存成功：' + filename)
                count += 1
                if count >= page_num:
                    break
        if count >= page_num:
            break

        # Follow the "next page" link; stop cleanly when there is none
        # instead of failing on a missing element.
        next_link = soup.find('a', class_='n')
        if next_link is None or not next_link.get('href'):
            print('没有下一页，提前结束')
            break
        newhtml = "https://image.baidu.com" + next_link['href']

    end = time.perf_counter()
    print('下载成功，花费' + str(end - start) + '秒')


# Command-line entry point: prompt for the keyword, the number of images and
# the save path, then run the spider with those values.
if __name__ == "__main__":
    item = input("你想要什么： ")
    num = int(input("想要多少张图片？： "))
    path = input("输入存储路径： ")

    run_spider(item, num, path)