言归正传上代码(2019/8/9)
import urllib
import requests
import os
import re
import sys
import time
import threading
from datetime import datetime as dt
from multiprocessing.dummy import Pool
from multiprocessing import Queue
from urllib import parse
# 类名与文件名相同
class BaiduImgDownloader(object):
    """Multithreaded downloader for Baidu image-search results.

    Baidu obfuscates the real image URL inside each result's ``objURL``
    field; :meth:`decode` reverses that obfuscation using the two mapping
    tables below.  Page-JSON URLs are built first, then resolved and the
    images downloaded through a thread pool, with progress/log messages
    funnelled through a queue to a dedicated logger thread.
    """

    # Multi-character tokens that stand in for ':', '.' and '/' in objURL.
    str_table = {
        '_z2C$q': ':',
        '_z&e3B': '.',
        'AzdH3F': '/'
    }
    # Single-character substitution cipher applied after str_table.
    char_table = {
        'w': 'a',
        'k': 'b',
        'v': 'c',
        '1': 'd',
        'j': 'e',
        'u': 'f',
        '2': 'g',
        'i': 'h',
        't': 'i',
        '3': 'j',
        'h': 'k',
        's': 'l',
        '4': 'm',
        'g': 'n',
        '5': 'o',
        'r': 'p',
        'q': 'q',
        '6': 'r',
        'f': 's',
        'p': 't',
        '7': 'u',
        'e': 'v',
        'o': 'w',
        '8': '1',
        'd': '2',
        'n': '3',
        '9': '4',
        'c': '5',
        'm': '6',
        '0': '7',
        'b': '8',
        'l': '9',
        'a': '0'
    }
    # Pulls the obfuscated URL and the image file type out of the page JSON.
    re_objURL = re.compile(r'"objURL":"(.*?)".*?"type":"(.*?)"')
    # Matches the download-progress log messages (used for periodic prompts).
    re_downNum = re.compile(r"已下载\s(\d+)\s张图片")
    # Desktop-browser headers so the crawler is not trivially identified.
    headers = {
        "User-Agent": "Mozilla/5.1 (Windows NT 10.0; WOW64) AppleWebKit/537.26 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, sdch",
    }

    def __init__(self, word, dirpath=None, processNum=30):
        """Prepare directories, log files, session, pool and queues.

        :param word: search keyword.
        :param dirpath: download directory; defaults to ``w:/image/<word>``.
        :param processNum: thread-pool size (was previously ignored — bug fix).
        """
        self.word = word
        # str.translate needs an ordinal->ordinal mapping, so convert once.
        self.char_table = {ord(key): ord(value)
                           for key, value in BaiduImgDownloader.char_table.items()}
        if not dirpath:
            dirpath = r'w:/image/%s' % self.word
        self.dirpath = dirpath
        self.jsonUrlFile = os.path.join(sys.path[0], 'jsonUrl.txt')
        self.logFile = os.path.join(sys.path[0], 'logInfo.txt')
        self.errorFile = os.path.join(sys.path[0], 'errorUrl.txt')
        if os.path.exists(self.errorFile):
            os.remove(self.errorFile)
        # Bug fix: the original called os.mkdir unconditionally, which raised
        # FileExistsError on reruns and FileNotFoundError if the parent was
        # missing.  makedirs(exist_ok=True) handles both cases.
        os.makedirs(self.dirpath, exist_ok=True)
        # Bug fix: honour processNum instead of the hard-coded 30.
        self.pool = Pool(processNum)
        self.session = requests.Session()
        self.session.headers = BaiduImgDownloader.headers
        self.queue = Queue()
        self.messageQueue = Queue()
        self.index = 0
        # Print a progress line every promptNum downloads.
        self.promptNum = 30
        self.lock = threading.Lock()
        # Seconds to wait between requests — too-frequent requests get the IP banned.
        self.delay = 3
        self.QUIT = "QUIT"
        self.printPrefix = "**"

    def start(self):
        """Run the whole pipeline: build URLs, resolve, download, report."""
        t = threading.Thread(target=self.__log)
        t.daemon = True  # setDaemon() is deprecated; attribute form is equivalent
        t.start()
        self.messageQueue.put(self.printPrefix + "脚本开始执行")
        start_time = dt.now()
        urls = self.__buildUrls()
        if len(urls) >= 4000:
            print('关键词过于模糊,请重新运行脚本输入 例如:泡面 则搜索康师傅泡面')
        else:
            self.messageQueue.put(self.printPrefix + "已获取 %s 个Json请求网址" % len(urls))
            self.pool.map(self.__resolveImgUrl, urls)
            # Drain the batches produced by __resolveImgUrl and download them.
            while self.queue.qsize():
                imgs = self.queue.get()
                self.pool.map_async(self.__downImg, imgs)
            self.pool.close()
            self.pool.join()
            self.messageQueue.put(self.printPrefix + "下载完成!已下载 %s 张图片,总用时 %s" %
                                  (self.index, dt.now() - start_time))
            self.messageQueue.put(self.printPrefix + "请到 %s 查看结果!" % self.dirpath)
            self.messageQueue.put(self.printPrefix + "错误信息保存在 %s" % self.errorFile)
        # Always stop the logger thread, whichever branch ran.
        self.messageQueue.put(self.QUIT)

    def __log(self):
        """Logger thread: write every queued message to the log file,
        echoing important ones (and every promptNum-th download) to stdout."""
        with open(self.logFile, "w", encoding="utf-8") as f:
            while True:
                message = self.messageQueue.get()
                if message == self.QUIT:
                    break
                message = str(dt.now()) + " " + message
                if self.printPrefix in message:
                    print(message)
                elif "已下载" in message:
                    # Only prompt once every promptNum downloaded images.
                    downNum = self.re_downNum.findall(message)
                    if downNum and int(downNum[0]) % self.promptNum == 0:
                        print(message)
                f.write(message + '\n')
                f.flush()

    def __getIndex(self):
        """Atomically return the current image index and advance it."""
        with self.lock:
            index = self.index
            self.index += 1
            return index

    def decode(self, url):
        """Decode an obfuscated objURL into the real image URL."""
        for key, value in self.str_table.items():
            url = url.replace(key, value)
        return url.translate(self.char_table)

    def __buildUrls(self):
        """Fetch the first result page to learn the total count, then build
        one JSON-request URL per 60-image page and persist them to disk."""
        word = urllib.parse.quote(self.word)
        # Bug fix: the original template was missing '&' between istype=2 and nc=1.
        url = r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2&nc=1&pn={pn}&rn=60"
        time.sleep(self.delay)
        html = self.session.get(url.format(word=word, pn=0), timeout=20).content.decode('utf-8')
        results = re.findall(r'"displayNum":(\d+),', html)
        maxNum = int(results[0]) if results else 0
        urls = [url.format(word=word, pn=x)
                for x in range(0, maxNum + 1, 60)]
        with open(self.jsonUrlFile, "w", encoding="utf-8") as f:
            for url in urls:
                f.write(url + "\n")
        return urls

    def __resolveImgUrl(self, url):
        """Fetch one JSON page, decode every objURL in it, and queue the batch."""
        time.sleep(self.delay)
        html = self.session.get(url, timeout=20).content.decode('utf-8')
        datas = self.re_objURL.findall(html)
        imgs = [Image(self.decode(x[0]), x[1]) for x in datas]
        self.messageQueue.put(self.printPrefix + "从网页已解析出 %s 个图片网址" % len(imgs))
        self.queue.put(imgs)

    def __downImg(self, img):
        """Download a single image; record failures instead of raising."""
        imgUrl = img.url
        # Initialise before the try so an early exception cannot leave the
        # name unbound (the original only assigned it inside the try body).
        message = None
        try:
            time.sleep(self.delay)
            res = self.session.get(imgUrl, timeout=20)
            if str(res.status_code)[0] == "4":
                message = "\n%s: %s" % (res.status_code, imgUrl)
            elif "text/html" in res.headers["Content-Type"]:
                message = "\n无法打开图片: %s" % imgUrl
        except Exception as e:
            message = "\n抛出异常: %s\n%s" % (imgUrl, str(e))
        if message:
            self.messageQueue.put(message)
            self.__saveError(message)
            return
        index = self.__getIndex()
        # index starts at 0, so the human-readable count is index + 1.
        self.messageQueue.put("已下载 %s 张图片:%s" % (index + 1, imgUrl))
        filename = os.path.join(self.dirpath, str(index) + "." + img.type)
        with open(filename, "wb") as f:
            f.write(res.content)

    def __saveError(self, message):
        """Append an error message to the error file under the shared lock."""
        with self.lock:
            with open(self.errorFile, "a", encoding="utf-8") as f:
                f.write(message)
class Image(object):
    """Lightweight record pairing an image URL with its file-type extension."""

    def __init__(self, url, type):
        # Plain attribute assignment; object.__init__ needs no explicit call.
        self.url = url
        self.type = type
# Script entry point: prompt for a search keyword and start the downloader.
if __name__ == '__main__':
    print("=~+=" * 20)
    # Bug fix: banner said "监本" (typo) instead of "脚本" (script).
    print('百度图片爬虫脚本')
    print("=~+=" * 20)
    word = input("请输入你要下载的图片关键词:\n")
    search_ = word.strip()
    start_down = BaiduImgDownloader(search_)
    start_down.start()
百度图片反爬的难点在于:首页加载的图片可以用正则从 objURL 字段中直接筛选出图片 url;但大量爬取时必须处理翻页,此时用同样的方法取到的却是如下内容:
ippr_z2C$qAzdH3FAzdH3Ft42_z&e3B4r_z&e3Btpv_z&e3BvgAzdH3F7rs5w1AzdH3Fda80a989AzdH3Fvll11lavn91j9kvllcdv181ajnl880bk_pi_z
这实际上是被编码(混淆)过的图片 url,需要用以下方式解析出真实 url:
def code_url(self, imlist):
    """Decode a list of obfuscated Baidu objURL strings into real URLs.

    Bug fix: the original inner loop did ``url = k.replace(key, value)`` —
    replacing from the *untouched* input on every pass — so only the last
    substitution survived; the result was also never translated through the
    character table nor returned.  This version applies the substitutions
    cumulatively, then the single-character mapping, and returns the list.

    :param imlist: iterable of obfuscated objURL strings.
    :return: list of decoded URLs, in input order.
    """
    # Single-character substitution cipher (applied after str_table).
    char_table = {
        'w': 'a',
        'k': 'b',
        'v': 'c',
        '1': 'd',
        'j': 'e',
        'u': 'f',
        '2': 'g',
        'i': 'h',
        't': 'i',
        '3': 'j',
        'h': 'k',
        's': 'l',
        '4': 'm',
        'g': 'n',
        '5': 'o',
        'r': 'p',
        'q': 'q',
        '6': 'r',
        'f': 's',
        'p': 't',
        '7': 'u',
        'e': 'v',
        'o': 'w',
        '8': '1',
        'd': '2',
        'n': '3',
        '9': '4',
        'c': '5',
        'm': '6',
        '0': '7',
        'b': '8',
        'l': '9',
        'a': '0'
    }
    # Multi-character tokens standing in for ':', '.' and '/'.
    str_table = {
        '_z2C$q': ':',
        '_z&e3B': '.',
        'AzdH3F': '/'
    }
    # str.translate needs an ordinal->ordinal mapping; build it once.
    trans = {ord(key): ord(value) for key, value in char_table.items()}
    decoded = []
    for obfuscated in imlist:
        url = obfuscated
        for key, value in str_table.items():
            # Replace on the accumulated result, not the original string.
            url = url.replace(key, value)
        decoded.append(url.translate(trans))
    return decoded
则 爬虫 完成了,可以去爬妹子图片去咯~~~~~略!略!略!