# -*- coding: utf-8 -*-
"""
Created on Sat Jul 21 10:08:39 2018
@author: Administrator
"""
import requests
from urllib import request,parse
from selenium import webdriver
import random
import re
import traceback
import logging
#
#https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=300&rn=30
#
#image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=300&rn=30
#Parse the query string into dict form
#data=parse.parse_qs('tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=300&rn=30')
#
#print(data)
##
#print(bytes(parse.urlencode(data),encoding = 'utf-8'))
#
# Set up the log file: errors raised while downloading are appended to
# baidu_picture.log in the current working directory.
logger = logging.getLogger('baidu_picture')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
# Explicit UTF-8: the messages this script logs contain Chinese text, which
# the platform's default file encoding may not be able to represent.
file_handler = logging.FileHandler('baidu_picture.log', encoding='utf-8')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
# Pool of User-Agent header strings (the original comment called this a
# "proxy pool", but the entries are browser identification strings).
useragentpool = [
'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)Gecko/20100101Firefox/4.0.1',
'Opera/9.80(Macintosh;IntelMacOSX10.6.8;U;en)Presto/2.8.131Version/11.11',
'Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11',
'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Trident/4.0;SE2.XMetaSr1.0;SE2.XMetaSr1.0;.NETCLR2.0.50727;SE2.XMetaSr1.0)',
'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Maxthon2.0)',
'Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;TencentTraveler4.0)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
]
# One entry is picked at random when this line runs; the resulting headers
# dict is then reused as-is wherever ua_headers is passed to a request.
ua_headers = {'User-Agent':random.choice(useragentpool)}
def req_arg(k, pic_name):
    """Build the query parameters for one Baidu image-search ``acjson`` request.

    Args:
        k: Value sent as the ``pn`` parameter (presumably the result offset;
            the caller passes a multiple of 30 as a string).
        pic_name: Search keyword, sent as both ``queryWord`` and ``word``.

    Returns:
        dict: Parameter names mapped to their values, suitable for
        ``urllib.parse.urlencode``.
    """
    # The original had a second, unreachable ``return data`` after a
    # commented-out alternate definition; both were dead and are removed.
    return {
        'tn': 'resultjson_com',
        'ipn': 'rj',
        'ct': '201326592',
        'fp': 'result',
        'queryWord': pic_name,
        'cl': '2',
        'lm': '-1',
        'ie': 'utf-8',
        'oe': 'utf-8',
        'word': pic_name,
        'pn': k,
        'rn': '30',
    }
#print(parse.urlencode(data))
#data_arg = bytes(parse.urlencode(data),encoding = 'utf-8')
def cir_par_url():
    """Prompt for a keyword and image count, collect thumbnail URLs, download them.

    Steps:
        1. Read the placeholder image from a hard-coded local path.
        2. Ask the user for the search keyword and the number of images.
        3. Request one ``acjson`` result page per 30 requested images and
           extract every ``thumbURL`` value with a regular expression.
        4. Pass the collected URLs and the placeholder bytes to ``download``.

    Side effects:
        Sets the module-level ``pic_name``, which ``download`` reads when it
        builds the output file names.

    Raises:
        ValueError: If the entered quantity is not an integer.
        OSError: If the placeholder image cannot be read or a page request
            fails (network errors from ``urllib`` are ``OSError`` subclasses).
    """
    global pic_name

    # Placeholder image written in place of any picture that fails to download.
    # Raw string: the backslashes in the Windows path must stay literal.
    with open(r"D:\web crawler爬虫\我的爬虫project\无法显示图片.png", 'rb') as f:
        def_pic = f.read()

    pic_name = input("输入你想爬取得图片名字>>>:")
    quantity = int(input("输入你要得到的图片的数量(注:只能为三十的倍数)>>>:"))

    # Compiled once, outside the page loop. Captures the thumbURL value of
    # each result entry; other URL fields in the response (hoverURL,
    # middleURL) could be matched the same way.
    thumb_pattern = re.compile(r'"thumbURL":"([\s\S]*?)"[\s\S]*?"middleURL"')

    collected = []
    page_count = quantity // 30
    # NOTE(review): offsets start at 30, so the results at offset 0 are never
    # requested -- confirm this is intended.
    for page in range(1, page_count + 1):
        offset = str(page * 30)
        url = 'https://image.baidu.com/search/acjson?' + parse.urlencode(
            req_arg(offset, pic_name))
        req = request.Request(url, headers=ua_headers)
        # Context manager closes the HTTP response once the body is read.
        with request.urlopen(req) as response:
            info = response.read().decode('utf-8')
        # Newer pages are placed in front of earlier ones, as in the original.
        collected = thumb_pattern.findall(info) + collected

    download(collected, def_pic)
def download(l_t, def_pic):
    """Download every URL in *l_t* into the hard-coded picture folder.

    Files are named ``<pic_name>-<index>.jpg`` with a 1-based index, where
    ``pic_name`` is the module-level keyword set by ``cir_par_url``. When a
    download fails, the placeholder bytes are written under the same name,
    the failure is logged with its traceback, and processing continues.
    A progress percentage is printed after each item and a summary at the end.

    Args:
        l_t: Sequence of image URLs.
        def_pic: Bytes of the placeholder image written for failed downloads.

    Returns:
        None.
    """
    total = len(l_t)
    failed = []
    for index, url in enumerate(l_t, start=1):
        # Raw string: the backslashes in the Windows path must stay literal.
        target = r"D:\图片\%s-%d.jpg" % (pic_name, index)
        try:
            req = request.Request(url, headers=ua_headers)
            # Context manager closes the HTTP response once the body is read.
            with request.urlopen(req) as response:
                payload = response.read()
            with open(target, 'wb') as f:
                f.write(payload)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still stop the run.
            failed.append("第%s张" % index)
            with open(target, 'wb') as f:
                f.write(def_pic)
            logger.error('发现错误+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
            logger.error('错误为=====>>:%s', traceback.format_exc())
        # Download progress as a percentage of all URLs.
        print("下载进度: ", str(round(index / total * 100, 2)) + '%')
    print('图片下载完毕共%s张图片' % total, '下载失误图片为:', failed)
# Run the crawler, then detach and close the log file handler. The cleanup is
# in ``finally`` so the log file handle is released even if the crawl raises;
# ``removeHandler`` alone does not close the underlying file.
try:
    cir_par_url()
finally:
    logger.removeHandler(file_handler)
    file_handler.close()
#
#print(info)
#
#print(type(info))
#"hoverURL":"https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=1370419985,4280388849&fm=27&gp=0.jpg"
#
#"thumbURL":"https://ss0.bdstatic.com/70cFvHSh_Q1YnxGkpoWK1HF6hhy/it/u=2367471971,4193239205&fm=27&gp=0.jpg"
#
#