言归正传上代码(2019/8/9)
import urllib
import requests
import os
import re
import sys
import time
import threading
from datetime import datetime as dt
from multiprocessing.dummy import Pool
from multiprocessing import Queue
from urllib import parse
# 类名与文件名相同
class BaiduImgDownloader(object):
    """Multithreaded downloader for Baidu image-search results.

    Baidu obfuscates the real image URL inside each result's ``objURL``
    field; :meth:`decode` reverses that obfuscation using the two mapping
    tables below.  Page-JSON URLs are built first, then resolved and the
    images downloaded through a thread pool, with progress/log messages
    funnelled through a queue to a dedicated logger thread.
    """

    # Multi-character tokens that stand in for ':', '.' and '/' in objURL.
    str_table = {
        '_z2C$q': ':',
        '_z&e3B': '.',
        'AzdH3F': '/'
    }
    # Single-character substitution cipher applied after str_table.
    char_table = {
        'w': 'a',
        'k': 'b',
        'v': 'c',
        '1': 'd',
        'j': 'e',
        'u': 'f',
        '2': 'g',
        'i': 'h',
        't': 'i',
        '3': 'j',
        'h': 'k',
        's': 'l',
        '4': 'm',
        'g': 'n',
        '5': 'o',
        'r': 'p',
        'q': 'q',
        '6': 'r',
        'f': 's',
        'p': 't',
        '7': 'u',
        'e': 'v',
        'o': 'w',
        '8': '1',
        'd': '2',
        'n': '3',
        '9': '4',
        'c': '5',
        'm': '6',
        '0': '7',
        'b': '8',
        'l': '9',
        'a': '0'
    }
    # Pulls the obfuscated URL and the image file type out of the page JSON.
    re_objURL = re.compile(r'"objURL":"(.*?)".*?"type":"(.*?)"')
    # Matches the download-progress log messages (used for periodic prompts).
    re_downNum = re.compile(r"已下载\s(\d+)\s张图片")
    # Desktop-browser headers so the crawler is not trivially identified.
    headers = {
        "User-Agent": "Mozilla/5.1 (Windows NT 10.0; WOW64) AppleWebKit/537.26 (KHTML, like Gecko) Chrome/46.0.2490.71 Safari/537.36",
        "Accept-Encoding": "gzip, deflate, sdch",
    }

    def __init__(self, word, dirpath=None, processNum=30):
        """Prepare directories, log files, session, pool and queues.

        :param word: search keyword.
        :param dirpath: download directory; defaults to ``w:/image/<word>``.
        :param processNum: thread-pool size (was previously ignored — bug fix).
        """
        self.word = word
        # str.translate needs an ordinal->ordinal mapping, so convert once.
        self.char_table = {ord(key): ord(value)
                           for key, value in BaiduImgDownloader.char_table.items()}
        if not dirpath:
            dirpath = r'w:/image/%s' % self.word
        self.dirpath = dirpath
        self.jsonUrlFile = os.path.join(sys.path[0], 'jsonUrl.txt')
        self.logFile = os.path.join(sys.path[0], 'logInfo.txt')
        self.errorFile = os.path.join(sys.path[0], 'errorUrl.txt')
        if os.path.exists(self.errorFile):
            os.remove(self.errorFile)
        # Bug fix: the original called os.mkdir unconditionally, which raised
        # FileExistsError on reruns and FileNotFoundError if the parent was
        # missing.  makedirs(exist_ok=True) handles both cases.
        os.makedirs(self.dirpath, exist_ok=True)
        # Bug fix: honour processNum instead of the hard-coded 30.
        self.pool = Pool(processNum)
        self.session = requests.Session()
        self.session.headers = BaiduImgDownloader.headers
        self.queue = Queue()
        self.messageQueue = Queue()
        self.index = 0
        # Print a progress line every promptNum downloads.
        self.promptNum = 30
        self.lock = threading.Lock()
        # Seconds to wait between requests — too-frequent requests get the IP banned.
        self.delay = 3
        self.QUIT = "QUIT"
        self.printPrefix = "**"

    def start(self):
        """Run the whole pipeline: build URLs, resolve, download, report."""
        t = threading.Thread(target=self.__log)
        t.daemon = True  # setDaemon() is deprecated; attribute form is equivalent
        t.start()
        self.messageQueue.put(self.printPrefix + "脚本开始执行")
        start_time = dt.now()
        urls = self.__buildUrls()
        if len(urls) >= 4000:
            print('关键词过于模糊,请重新运行脚本输入 例如:泡面 则搜索康师傅泡面')
        else:
            self.messageQueue.put(self.printPrefix + "已获取 %s 个Json请求网址" % len(urls))
            self.pool.map(self.__resolveImgUrl, urls)
            # Drain the batches produced by __resolveImgUrl and download them.
            while self.queue.qsize():
                imgs = self.queue.get()
                self.pool.map_async(self.__downImg, imgs)
            self.pool.close()
            self.pool.join()
            self.messageQueue.put(self.printPrefix + "下载完成!已下载 %s 张图片,总用时 %s" %
                                  (self.index, dt.now() - start_time))
            self.messageQueue.put(self.printPrefix + "请到 %s 查看结果!" % self.dirpath)
            self.messageQueue.put(self.printPrefix + "错误信息保存在 %s" % self.errorFile)
        # Always stop the logger thread, whichever branch ran.
        self.messageQueue.put(self.QUIT)

    def __log(self):
        """Logger thread: write every queued message to the log file,
        echoing important ones (and every promptNum-th download) to stdout."""
        with open(self.logFile, "w", encoding="utf-8") as f:
            while True:
                message = self.messageQueue.get()
                if message == self.QUIT:
                    break
                message = str(dt.now()) + " " + message
                if self.printPrefix in message:
                    print(message)
                elif "已下载" in message:
                    # Only prompt once every promptNum downloaded images.
                    downNum = self.re_downNum.findall(message)
                    if downNum and int(downNum[0]) % self.promptNum == 0:
                        print(message)
                f.write(message + '\n')
                f.flush()

    def __getIndex(self):
        """Atomically return the current image index and advance it."""
        with self.lock:
            index = self.index
            self.index += 1
            return index

    def decode(self, url):
        """Decode an obfuscated objURL into the real image URL."""
        for key, value in self.str_table.items():
            url = url.replace(key, value)
        return url.translate(self.char_table)

    def __buildUrls(self):
        """Fetch the first result page to learn the total count, then build
        one JSON-request URL per 60-image page and persist them to disk."""
        word = urllib.parse.quote(self.word)
        # Bug fix: the original template was missing '&' between istype=2 and nc=1.
        url = r"http://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&fp=result&queryWord={word}&cl=2&lm=-1&ie=utf-8&oe=utf-8&st=-1&ic=0&word={word}&face=0&istype=2&nc=1&pn={pn}&rn=60"
        time.sleep(self.delay)
        html = self.session.get(url.format(word=word, pn=0), timeout=20).content.decode('utf-8')
        results = re.findall(r'"displayNum":(\d+),', html)
        maxNum = int(results[0]) if results else 0
        urls = [url.format(word=word, pn=x)
                for x in range(0, maxNum + 1, 60)]
        with open(self.jsonUrlFile, "w", encoding="utf-8") as f:
            for url in urls:
                f.write(url + "\n")
        return urls

    def __resolveImgUrl(self, url):
        """Fetch one JSON page, decode every objURL in it, and queue the batch."""
        time.sleep(self.delay)
        html = self.session.get(url, timeout=20).content.decode('utf-8')
        datas = self.re_objURL.findall(html)
        imgs = [Image(self.decode(x[0]), x[1]) for x in datas]
        self.messageQueue.put(self.printPrefix + "从网页已解析出 %s 个图片网址" % len(imgs))
        self.queue.put(imgs)

    def __downImg(self, img):
        """Download a single image; record failures instead of raising."""
        imgUrl = img.url
        # Initialise before the try so an early exception cannot leave the
        # name unbound (the original only assigned it inside the try body).
        message = None
        try:
            time.sleep(self.delay)
            res = self.session.get(imgUrl, timeout=20)
            if str(res.status_code)[0] == "4":
                message = "\n%s: %s" % (res.status_code, imgUrl)
            elif "text/html" in res.headers["Content-Type"]:
                message = "\n无法打开图片: %s" % imgUrl
        except Exception as e:
            message = "\n抛出异常: %s\n%s" % (imgUrl, str(e))
        if message:
            self.messageQueue.put(message)
            self.__saveError(message)
            return
        index = self.__getIndex()
        # index starts at 0, so the human-readable count is index + 1.
        self.messageQueue.put("已下载 %s 张图片:%s" % (index + 1, imgUrl))
        filename = os.path.join(self.dirpath, str(index) + "." + img.type)
        with open(filename, "wb") as f:
            f.write(res.content)

    def __saveError(self, message):
        """Append an error message to the error file under the shared lock."""
        with self.lock:
            with open(self.errorFile, "a", encoding="utf-8") as f:
                f.write(message)
class Image(object):
    """Lightweight record pairing an image URL with its file-type extension."""

    def __init__(self, url, type):
        # Plain attribute assignment; object.__init__ needs no explicit call.
        self.url = url
        self.type = type
# Script entry point: prompt for a search keyword and start the downloader.
if __name__ == '__main__':
    print("=~+=" * 20)
    # Bug fix: banner said "监本" (typo) instead of "脚本" (script).
    print('百度图片爬虫脚本')
    print("=~+=" * 20)
    word = input("请输入你要下载的图片关键词:\n")
    search_ = word.strip()
    start_down = BaiduImgDownloader(search_)
    start_down.start()
百度图片反爬的难点在于:首页加载的图片可以用正则从 objURL 字段中直接筛选出图片 url;但大量爬取时必须处理翻页,此时用同样的方法取到的却是如下内容:
ippr_z2C$qAzdH3FAzdH3Ft42_z&e3B4r_z&e3Btpv_z&e3BvgAzdH3F7rs5w1AzdH3Fda80a989AzdH3Fvll11lavn91j9kvllcdv181ajnl880bk_pi_z
这实际上是被编码(混淆)过的图片 url,需要用以下方式解析出真实 url:
def code_url(self, imlist):
    """Decode a list of obfuscated Baidu objURL strings into real URLs.

    Bug fix: the original inner loop did ``url = k.replace(key, value)`` —
    replacing from the *untouched* input on every pass — so only the last
    substitution survived; the result was also never translated through the
    character table nor returned.  This version applies the substitutions
    cumulatively, then the single-character mapping, and returns the list.

    :param imlist: iterable of obfuscated objURL strings.
    :return: list of decoded URLs, in input order.
    """
    # Single-character substitution cipher (applied after str_table).
    char_table = {
        'w': 'a',
        'k': 'b',
        'v': 'c',
        '1': 'd',
        'j': 'e',
        'u': 'f',
        '2': 'g',
        'i': 'h',
        't': 'i',
        '3': 'j',
        'h': 'k',
        's': 'l',
        '4': 'm',
        'g': 'n',
        '5': 'o',
        'r': 'p',
        'q': 'q',
        '6': 'r',
        'f': 's',
        'p': 't',
        '7': 'u',
        'e': 'v',
        'o': 'w',
        '8': '1',
        'd': '2',
        'n': '3',
        '9': '4',
        'c': '5',
        'm': '6',
        '0': '7',
        'b': '8',
        'l': '9',
        'a': '0'
    }
    # Multi-character tokens standing in for ':', '.' and '/'.
    str_table = {
        '_z2C$q': ':',
        '_z&e3B': '.',
        'AzdH3F': '/'
    }
    # str.translate needs an ordinal->ordinal mapping; build it once.
    trans = {ord(key): ord(value) for key, value in char_table.items()}
    decoded = []
    for obfuscated in imlist:
        url = obfuscated
        for key, value in str_table.items():
            # Replace on the accumulated result, not the original string.
            url = url.replace(key, value)
        decoded.append(url.translate(trans))
    return decoded
则 爬虫 完成了,可以去爬妹子图片去咯~~~~~略!略!略!