Baidu Image Crawler (Python 3.6 + requests)

Without further ado, here is the code (2019/8/9):

import urllib
import requests
import os
import re
import sys
import time
import threading
from datetime import datetime as dt
from multiprocessing.dummy import Pool
from multiprocessing import Queue
from urllib import parse


# Class name matches the file name
class BaiduImgDownloader(object):
    # Mapping tables used to decode the obfuscated image URLs
    str_table = {
        '_z2C$q': ':',
        '_z&e3B': '.',
        'AzdH3F': '/'
    }

    char_table = {
        'w': 'a',
        'k': 'b',
        'v': 'c',
        '1': 'd',
        'j': 'e',
        'u': 'f',
        '2': 'g',
        'i': 'h',
        't': 'i',
        '3': 'j',
        'h': 'k',
        's': 'l',
        '4': 'm',
        'g': 'n',
        '5': 'o',
        'r': 'p',
        'q': 'q',
        '6': 'r',
        'f': 's',
        'p': 't',
        '7': 'u',
        'e': 'v',
        'o': 'w',
        '8': '1',
        'd': '2',
        'n': '3',
        '9': '4',
        'c': '5',
        'm': '6',
        '0': '7',
        'b': '8',
        'l': '9',
        'a': '0'
    }
    # Regex for the objURL (real image URL) and type fields in the JSON response
    re_objURL = re.compile(r'"objURL":"(.*?)".*?"type":"(.*?)"')
    re_downNum = re.compile(r"Downloaded\s(\d+)\simages")
    # Desktop browser headers, to disguise the client as a real browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, sdch",
    }

    def __init__(self, word, dirpath=None, processNum=30):
        self.word = word
        self.char_table = {ord(key): ord(value)
                           for key, value in BaiduImgDownloader.char_table.items()}
        # dirpath is the directory where downloaded images are saved
        if not dirpath:
            dirpath = os.path.join(r'w:\image', self.word)
        self.dirpath = dirpath
        self.jsonUrlFile = os.path.join(sys.path[0], 'jsonUrl.txt')
        self.logFile = os.path.join(sys.path[0], 'logInfo.txt')
        self.errorFile = os.path.join(sys.path[0], 'errorUrl.txt')
        if os.path.exists(self.errorFile):
            os.remove(self.errorFile)
        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)
        self.pool = Pool(processNum)
        self.session = requests.Session()
        self.session.headers = BaiduImgDownloader.headers
        self.queue = Queue()
        self.messageQueue = Queue()
        self.index = 0
        # Print a progress message once every promptNum downloads
        self.promptNum = 30
        self.lock = threading.Lock()
        # Delay (seconds) between requests; hitting the server too often gets the IP banned
        self.delay = 3
        self.QUIT = "QUIT"
        self.printPrefix = "**"

    def start(self):
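        # Consume log messages on a background daemon thread so workers never block on printing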
        t = threading.Thread(target=self.__log)
        t.daemon = True
        t.start()
        self.messageQueue.put(self.printPrefix + "脚本开始执行")
        start_time = dt.now()
        urls = self.__buildUrls()
        if len(urls) >= 4000:
            print('Keyword is too broad; rerun the script with a more specific term, '
                  'e.g. search "康师傅泡面" (Master Kong instant noodles) rather than just "泡面" (instant noodles)')
        else:
            self.messageQueue.put(self.printPrefix + "Collected %s JSON request URLs" % len(urls))
            self.pool.map(self.__resolveImgUrl, urls)
        while self.queue.qsize():
            imgs = self.queue.get()
            self.pool.map_async(self.__downImg, imgs)
        self.pool.close()
        self.pool.join()
        self.messageQueue.put(self.printPrefix + "下载完成!已下载 %s 张图片,总用时 %s" %
                              (self.index, dt.now() - start_time))
        self.messageQueue.put(self.printPrefix + "请到 %s 查看结果!" % self.dirpath)
        self.messageQueue.put(self.printPrefix + "错误信息保存在 %s" % self.errorFile)
        self.messageQueue.put(self.QUIT)

    def __log(self):
        with open(self.logFile, "w", encoding="utf-8") as f:
            while True:
                message = self.messageQueue.get()
                if message == self.QUIT:
                    break
                message = str(dt.now()) + " " + message
                if self.printPrefix in message:
                    print(message)
                elif "已下载" in message:
                    # 下载N张图片提示一次
                    downNum = self.re_downNum.findall(message)
                    if downNum and int(downNum[0]) % self.promptNum == 0:
                        print(message)
                f.write(message + '\n')
                f.flush()

    def __getIndex(self):
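        # Hand out the next image index; the lock keeps it safe across worker threads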
        self.lock.acquire()
        try:
            return self.index
        finally:
            self.index += 1
            self.lock.release()

    def decode(self, url):
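        # Expand the multi-character tokens first, then translate single characters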
        for key, value in self.str_table.items():
            url = url.replace(key, value)
        return url.translate(self.char_table)

    def __buildUrls(self):
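        # Build one JSON request URL per page of 60 results, up to displayNum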
        word = urllib.parse.quote(self.word)
        url = r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2nc=1&pn={pn}&rn=60"
        time.sleep(self.delay)
        html = self.session.get(url.format(word=word, pn=0), timeout=20).content.decode('utf-8')
        results = re.findall(r'"displayNum":(\d+),', html)
        maxNum = int(results[0]) if results else 0
        urls = [url.format(word=word, pn=x)
                for x in range(0, maxNum + 1, 60)]
        with open(self.jsonUrlFile, "w", encoding="utf-8") as f:
            for url in urls:
                f.write(url + "\n")
        return urls

    def __resolveImgUrl(self, url):
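        # Fetch one JSON page and turn each (objURL, type) pair into an Image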
        time.sleep(self.delay)
        html = self.session.get(url, timeout=20).content.decode('utf-8')
        datas = self.re_objURL.findall(html)
        imgs = [Image(self.decode(x[0]), x[1]) for x in datas]
        self.messageQueue.put(self.printPrefix + "从网页已解析出 %s 个图片网址" % len(imgs))
        self.queue.put(imgs)

    def __downImg(self, img):
        imgUrl = img.url
        # initialize before the try block so the finally clause can always read it
        message = None
        try:
            time.sleep(self.delay)
            res = self.session.get(imgUrl, timeout=20)
            if str(res.status_code)[0] == "4":
                message = "\n%s: %s" % (res.status_code, imgUrl)
            elif "text/html" in res.headers.get("Content-Type", ""):
                message = "\nNot an image: %s" % imgUrl
        except Exception as e:
            message = "\nException raised: %s\n%s" % (imgUrl, str(e))
        finally:
            if message:
                self.messageQueue.put(message)
                self.__saveError(message)
                return

        index = self.__getIndex()
        # index starts at 0
        self.messageQueue.put("Downloaded %s images: %s" % (index + 1, imgUrl))
        filename = os.path.join(self.dirpath, str(index) + "." + img.type)
        with open(filename, "wb") as f:
            f.write(res.content)

    def __saveError(self, message):
        self.lock.acquire()
        try:
            with open(self.errorFile, "a", encoding="utf-8") as f:
                f.write(message)
        finally:
            self.lock.release()


class Image(object):
    def __init__(self, url, type):
        super(Image, self).__init__()
        self.url = url
        self.type = type

# Entry point
if __name__ == '__main__':
    print("=~+=" * 20)
    print('Baidu Image Crawler script')
    print("=~+=" * 20)
    word = input("Enter a keyword for the images you want to download:\n")
    search_ = word.strip()
    start_down = BaiduImgDownloader(search_)
    start_down.start()
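For reference, the downloader can also be driven from another script rather than through the interactive prompt. A minimal sketch, assuming the file is saved as BaiduImgDownloader.py (per the comment above that the class name matches the file name); the keyword and directory here are just example values:

# drive the downloader programmatically
from BaiduImgDownloader import BaiduImgDownloader

downloader = BaiduImgDownloader("康师傅泡面", dirpath="./images")
downloader.start()

Passing dirpath explicitly sidesteps the hard-coded w:\image default in __init__.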

The tricky part of scraping Baidu Images is its anti-crawling obfuscation: on the first page load you can filter the image URLs out of the response with a regex on the objURL field, but bulk crawling forces you to deal with pagination, and applying the same approach to the paginated responses yields strings like this:

ippr_z2C$qAzdH3FAzdH3Ft42_z&e3B4r_z&e3Btpv_z&e3BvgAzdH3F7rs5w1AzdH3Fda80a989AzdH3Fvll11lavn91j9kvllcdv181ajnl880bk_pi_z

This is actually an obfuscated request URL, and it has to be decoded back into a real URL, for example with a method like the following:

    def code_url(self, imlist):
        # single-character substitution table (obfuscated char -> real char)
        char_table = {
            'w': 'a',
            'k': 'b',
            'v': 'c',
            '1': 'd',
            'j': 'e',
            'u': 'f',
            '2': 'g',
            'i': 'h',
            't': 'i',
            '3': 'j',
            'h': 'k',
            's': 'l',
            '4': 'm',
            'g': 'n',
            '5': 'o',
            'r': 'p',
            'q': 'q',
            '6': 'r',
            'f': 's',
            'p': 't',
            '7': 'u',
            'e': 'v',
            'o': 'w',
            '8': '1',
            'd': '2',
            'n': '3',
            '9': '4',
            'c': '5',
            'm': '6',
            '0': '7',
            'b': '8',
            'l': '9',
            'a': '0'
        }
        # multi-character tokens standing in for ':', '.' and '/'
        str_table = {
            '_z2C$q': ':',
            '_z&e3B': '.',
            'AzdH3F': '/'
        }
        decoded = []
        for k in imlist:
            # expand the multi-character tokens first...
            for key, value in str_table.items():
                k = k.replace(key, value)
            # ...then translate the remaining single characters
            decoded.append(k.translate({ord(a): ord(b) for a, b in char_table.items()}))
        return decoded
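As a quick sanity check, the sample string above can also be decoded outside the class. A minimal standalone sketch of the same two-step scheme, with the tables copied from the ones shown earlier (expanding the multi-character tokens first matters, otherwise their letters would be mangled by the character map):

str_table = {'_z2C$q': ':', '_z&e3B': '.', 'AzdH3F': '/'}
char_table = {'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f',
              '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l',
              '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r',
              'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1',
              'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7',
              'b': '8', 'l': '9', 'a': '0'}

def decode_baidu_url(url):
    # expand the multi-character tokens, then translate single characters
    for key, value in str_table.items():
        url = url.replace(key, value)
    return url.translate({ord(a): ord(b) for a, b in char_table.items()})

encoded = ('ippr_z2C$qAzdH3FAzdH3Ft42_z&e3B4r_z&e3Btpv_z&e3BvgAzdH3F7rs5w1'
           'AzdH3Fda80a989AzdH3Fvll11lavn91j9kvllcdv181ajnl880bk_pi_z')
print(decode_baidu_url(encoded))  # the decoded result starts with "http://"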

And with that, the crawler is done. Happy scraping!
