Python爬虫15--爬虫遇上多线程，速度更上一层楼，爬取1000张图片连一分钟也不要！

本文链接：https://blog.csdn.net/weixin_47326735/article/details/112137525

将多线程和爬虫结合，能让爬取速度更上一层楼，爬取1000张图片连一分钟也不要！

百度图片对于爬虫相当的友好，基本不需要做反反爬虫的设置，点赞 :)

1.分析百度图片网址：

从百度进去搜索图片，例如搜索 “拳皇” ，很大可能会看到地址栏出现一大坨东西，就像这样：

https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1609662428291_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&hs=2&sid=&word=%E6%8B%B3%E7%9A%87

看得人头疼，将这个难看的URL地址拿到URL解析网站解析一下看看，发现最后的一串数据（word 参数的值 %E6%8B%B3%E7%9A%87）就是我们要找的图片名称“拳皇”经过URL编码后的结果：

URL中有很多内容对于我们爬虫来说其实是可有可无的：大部分 “&” 符号后面的参数都可以去掉，紧跟在 “?” 之后的第一个参数通常也是如此，但这个URL是例外（“?” 后面的 tn=baiduimage 需要保留）。

不认识的都可以试着删除，要用的话大不了之后再加回来，得到了如下URL地址，怎么样，舒服多了吧。这就是初始URL了。

https://image.baidu.com/search/index?tn=baiduimage&word=%E6%8B%B3%E7%9A%87

2.寻找图片信息

打开浏览器自带的抓包工具，重新加载一下网站，耐心地寻找响应，最后终于找到了。图片的地址居然藏在JSON中，那事情似乎变得简单了起来。

3.直接上代码吧，请求图片可能比较慢，直接上5个线程。不用怕速度太快被反爬

import requests
import json
import threading
from queue import Queue
import os

class ImgSpider:
    """Multi-threaded downloader for Baidu image-search thumbnails.

    The work is organised as a pipeline of four queues, each drained by its
    own worker thread(s):

    1. ``url_list_queue``      -- result-page URLs (``PAGE_SIZE`` hits each)
    2. ``response_url_queue``  -- raw JSON text of every fetched result page
    3. ``img_list_queue``      -- thumbnail URLs extracted from that JSON
    4. ``img_content_queue``   -- downloaded image bytes waiting to be saved

    Every worker marks its item with ``task_done()`` in a ``finally`` clause,
    so a failed request or a malformed page is reported and skipped instead
    of leaving ``Queue.join()`` in :meth:`run` blocked forever.
    """

    # Number of results stepped over per result page (the ``pn`` offset).
    PAGE_SIZE = 30
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10
    # Number of threads downloading image bytes concurrently.
    DOWNLOAD_WORKERS = 5

    def __init__(self, img_name, list_number):
        """Prepare the queues and URL templates for one search keyword.

        Args:
            img_name: Search keyword; also used as the output directory name
                and as the file-name prefix of every saved image.
            list_number: Number of images wanted. It is rounded *up* to a
                whole number of result pages and stored in
                ``self.list_number`` as a page count.
        """
        # Imported here so the block does not depend on a new file-level import.
        from urllib.parse import quote

        self.img_name = img_name
        # Ceiling division: 1..30 -> 1 page, 31..60 -> 2 pages, and so on.
        # The previous ``n // 30 + 1`` fetched one page too many whenever the
        # request was already an exact multiple of the page size.
        self.list_number = max(0, int(-(-list_number // self.PAGE_SIZE)))
        self.original_url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&pn={}&ipn=rj&word={}'
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot break the query string.
        self._quoted_name = quote(str(img_name), safe='')
        self.start_url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&pn=0&ipn=rj&word={}'.format(self._quoted_name)
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"}
        self.url_list_queue = Queue()
        self.response_url_queue = Queue()
        self.img_list_queue = Queue()
        self.img_content_queue = Queue()

    def url_list(self):
        """Enqueue one result-page URL per page in ``self.list_number``."""
        for page in range(self.list_number):
            url = self.original_url.format(page * self.PAGE_SIZE, self._quoted_name)
            self.url_list_queue.put(url)

    def parse_url(self):
        """Worker loop: fetch each result page and enqueue its body text."""
        while True:
            url = self.url_list_queue.get()
            try:
                response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                # Undecodable bytes are replaced rather than aborting the page.
                self.response_url_queue.put(response.content.decode('utf-8', errors='replace'))
            except requests.RequestException as exc:
                print('请求失败:', url, exc)
            finally:
                # Always acknowledge the item, otherwise join() never returns.
                self.url_list_queue.task_done()

    @staticmethod
    def _extract_thumb_urls(text):
        """Return the non-empty ``thumbURL`` values found in a result page.

        Returns an empty list when ``text`` is not valid JSON or does not have
        the expected ``{"data": [...]}`` layout. Entries without a
        ``thumbURL`` are skipped; the original code dropped the final entry
        unconditionally, presumably because it is an empty placeholder.
        """
        try:
            payload = json.loads(text)
        except ValueError:
            return []
        if not isinstance(payload, dict):
            return []
        entries = payload.get("data")
        if not isinstance(entries, list):
            return []
        return [
            entry['thumbURL']
            for entry in entries
            if isinstance(entry, dict) and entry.get('thumbURL')
        ]

    def get_img_url(self):
        """Worker loop: extract thumbnail URLs from each fetched page."""
        while True:
            content = self.response_url_queue.get()
            try:
                urls = self._extract_thumb_urls(content)
                if not urls:
                    print('未解析到图片地址')
                for url in urls:
                    self.img_list_queue.put(url)
            finally:
                self.response_url_queue.task_done()

    def parse_img_list(self):
        """Worker loop: download each thumbnail and enqueue its raw bytes."""
        while True:
            url = self.img_list_queue.get()
            try:
                print('正在请求:', url)
                response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                self.img_content_queue.put(response.content)
            except requests.RequestException as exc:
                print('请求失败:', url, exc)
            finally:
                self.img_list_queue.task_done()

    def save_img(self):
        """Worker loop: write each downloaded image to ``<img_name>/<img_name><n>.png``."""
        # exist_ok avoids dying with FileExistsError when the keyword has been
        # downloaded before (which previously dead-locked run()).
        os.makedirs(self.img_name, exist_ok=True)
        count = 0
        while True:
            # Take the data *before* opening the file, so no empty file is
            # left behind while the worker is blocked waiting for more input.
            content = self.img_content_queue.get()
            try:
                path = os.path.join(self.img_name, '{}{}.png'.format(self.img_name, count))
                with open(path, 'wb') as f:
                    f.write(content)
                count += 1
            except OSError as exc:
                print('保存失败:', exc)
            finally:
                self.img_content_queue.task_done()

    def run(self):
        """Run the whole pipeline and block until every queue is drained."""
        # 1. Build the list of result-page URLs.
        self.url_list()
        # Create the output directory here so that a bad directory name raises
        # in the calling thread instead of silently killing the saver thread.
        os.makedirs(self.img_name, exist_ok=True)

        # 2. One thread fetches result pages, 3. one extracts image URLs.
        workers = [
            threading.Thread(target=self.parse_url, daemon=True),
            threading.Thread(target=self.get_img_url, daemon=True),
        ]
        # 4. Several threads download the image bytes.
        for _ in range(self.DOWNLOAD_WORKERS):
            workers.append(threading.Thread(target=self.parse_img_list, daemon=True))
        # 5. One thread writes the images to disk.
        workers.append(threading.Thread(target=self.save_img, daemon=True))

        # Workers loop forever; as daemon threads they end with the process.
        for worker in workers:
            worker.start()

        # Join in pipeline order: each stage enqueues its output before
        # calling task_done(), so a drained queue means the next one is
        # already fully populated.
        self.url_list_queue.join()
        self.response_url_queue.join()
        self.img_list_queue.join()
        self.img_content_queue.join()


# Ask for the search keyword and the number of images wanted, then run the
# crawl; run() blocks until every queue has been drained.
img_name = input("输入要下载的图片名称：")
list_number = input("下载数量(自动补齐成30的倍数):")
list_number = int(list_number)  # raises ValueError on non-numeric input
img = ImgSpider(img_name=img_name, list_number=list_number)
img.run()