python爬取百度美女图片

首先用chrome浏览器打开百度图片官网,抓包发现url

https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=美女&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word=美女&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=90&rn=30
queryWord和word是关键字
pn为30的倍数(结果偏移量,每页rn=30条)

截图如下
在这里插入图片描述
1.单线程爬取

# -*- encoding:utf-8 -*-

"""
@python: 3.7
@Author: xiaobai_IT_learn
@Time: 2019-10-31 10:00
"""
import os
import re
import time
import requests

IMAGE_PATH = './baidu_image'


class BaiduImageSpider(object):
    """Single-threaded crawler for Baidu image-search thumbnails.

    Thumbnails matching ``key_word`` are downloaded and written to
    IMAGE_PATH as sequentially numbered ``.jpg`` files.
    """

    def __init__(self, key_word):
        """
        :param key_word: search term used for both queryWord and word params
        """
        # Pretend to be desktop Chrome; Baidu rejects the default
        # python-requests User-Agent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/77.0.3865.120 Safari/537.36'
        }
        self.key_word = key_word
        self.num = 1  # next output file number
        self._file()

    def _file(self):
        """Create the output folder if it does not exist yet."""
        # exist_ok avoids the check-then-create race of exists()+mkdir().
        os.makedirs(IMAGE_PATH, exist_ok=True)

    def get_url_list(self):
        """Build the 30 paginated acjson API URLs for the key word.

        :return: list of 30 request URLs, one per page of 30 results.
        """
        url_template = (
            'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result'
            '&queryWord={0}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word={0}'
            '&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&cg=girl&pn={1}&rn=30'
        )
        # BUGFIX: `word` was hard-coded to one fixed keyword while
        # `queryWord` used self.key_word; both must carry the search term.
        # `pn` is the result offset, a multiple of 30 (rn=30 per page).
        return [url_template.format(self.key_word, page * 30) for page in range(30)]

    def spider_baidu_image(self, url):
        """Fetch one result page and download every thumbnail it lists.

        :param url: one acjson page URL produced by get_url_list()
        """
        html = requests.get(url, headers=self.headers, timeout=10)
        html_str = html.content.decode()
        # Pull thumbnail URLs out of the JSON-ish response with a regex
        # instead of parsing the (often malformed) JSON body.
        image_list = re.findall(r"\"thumbURL\":\"(.*?)\",\"middleURL\"", html_str)
        print(len(image_list))
        for image_url in image_list:
            try:
                content = requests.get(image_url, headers=self.headers, timeout=10).content
            except Exception as e:
                # Best-effort: skip images that fail to download.
                print(e)
                continue
            file_path = IMAGE_PATH + '/' + str(self.num) + '.jpg'
            with open(file_path, 'wb') as f:
                f.write(content)
            self.num += 1

    def run(self):
        """Crawl every result page sequentially."""
        for url in self.get_url_list():
            self.spider_baidu_image(url)


if __name__ == '__main__':
    # Prompt for a search term, crawl, and report elapsed seconds.
    query = input('输入要查询的关键字:')
    started = time.time()
    crawler = BaiduImageSpider(query)
    crawler.run()
    elapsed = time.time() - started
    print(elapsed)

单线程耗时:54.337618589401245,总共629张图片

# -*- encoding:utf-8 -*-

"""
@python: 3.7
@Author: xiaobai_IT_learn
@Time: 2019-10-31 10:00
"""
import os
import re
import threading
import time
from queue import Queue
import requests

IMAGE_PATH = './baidu_image_threading'


class BaiduImageSpider(object):
    """Multi-threaded crawler for Baidu image-search thumbnails.

    One producer thread fills a URL queue; three daemon consumer threads
    download thumbnails into IMAGE_PATH as numbered ``.jpg`` files.
    """

    def __init__(self, key_word):
        """
        :param key_word: search term used for both queryWord and word params
        """
        # Pretend to be desktop Chrome; Baidu rejects the default
        # python-requests User-Agent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/77.0.3865.120 Safari/537.36'
        }
        self.key_word = key_word
        self.url_queue = Queue()
        self.num = 1  # next output file number, shared by worker threads
        self.num_lock = threading.Lock()  # guards self.num across workers
        self._file()

    def _file(self):
        """Create the output folder if it does not exist yet."""
        # exist_ok avoids the check-then-create race of exists()+mkdir().
        os.makedirs(IMAGE_PATH, exist_ok=True)

    def get_url_list(self):
        """Producer: enqueue the 30 paginated acjson API URLs."""
        for i in range(30):
            # pn is the result offset, a multiple of 30 (rn=30 per page).
            url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result' \
                  '&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright=&word={}' \
                  '&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&cg=girl&pn={}&rn=30'\
                    .format(self.key_word, self.key_word, i*30)
            self.url_queue.put(url)

    def spider_baidu_image(self):
        """Consumer loop: take page URLs off the queue and download images.

        Runs forever in a daemon thread; calls task_done() per page so the
        main thread's url_queue.join() can unblock when all pages are done.
        """
        while True:
            url = self.url_queue.get()
            html = requests.get(url, headers=self.headers)
            html_str = html.content.decode()
            # Pull thumbnail URLs out of the JSON-ish response with a regex.
            image_list = re.findall(r"\"thumbURL\":\"(.*?)\",\"middleURL\"", html_str)
            for image_url in image_list:
                try:
                    content = requests.get(image_url, headers=self.headers).content
                except Exception as e:
                    # Best-effort: skip images that fail to download.
                    print(e)
                    continue
                # BUGFIX: reserve a unique file number under a lock; the
                # old unsynchronized `self.num += 1` let two workers pick
                # the same number and overwrite each other's files.
                with self.num_lock:
                    num = self.num
                    self.num += 1
                file_path = IMAGE_PATH + '/' + str(num) + '.jpg'
                print(num)
                with open(file_path, 'wb') as f:
                    f.write(content)
            self.url_queue.task_done()

    def run(self):
        """Start the producer and 3 consumers; wait for all pages."""
        producer = threading.Thread(target=self.get_url_list, daemon=True)
        workers = [
            threading.Thread(target=self.spider_baidu_image, daemon=True)
            for _ in range(3)
        ]
        producer.start()
        for worker in workers:
            worker.start()
        # BUGFIX: wait for the producer before joining the queue; otherwise
        # url_queue.join() can run while the queue is still empty, return
        # immediately, and kill the daemon workers before any download.
        producer.join()
        self.url_queue.join()


if __name__ == '__main__':
    # Prompt for a search term, crawl with the threaded spider, and
    # report elapsed seconds.
    query = input('输入要查询的关键字:')
    started = time.time()
    crawler = BaiduImageSpider(query)
    crawler.run()
    elapsed = time.time() - started
    print(elapsed)

多线程爬取耗时:16.234709978103638,总共629张图片

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值