# Web crawler: a small project that scrapes images from Baidu Image Search.

# -*- coding: utf-8 -*-
"""
Created on Sat Jul 21 10:08:39 2018

@author: Administrator
"""

import requests
from urllib import request,parse
from selenium import webdriver
import random
import re
import traceback
import logging
#
#https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=300&rn=30
#
#image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=300&rn=30

#解析成字典形式
#data=parse.parse_qs('tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=&z=&ic=&word=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E5%9B%BE%E7%89%87&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=&fr=&pn=300&rn=30')
#
#print(data)
##
#print(bytes(parse.urlencode(data),encoding = 'utf-8'))
#

# --- Logging setup ---------------------------------------------------------
# Errors raised while downloading individual images are appended to a
# dedicated log file instead of being printed to the console.
logger = logging.getLogger('baidu_picture')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
file_handler = logging.FileHandler('baidu_picture.log')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)


# --- User-Agent pool -------------------------------------------------------
# NOTE(review): the original strings had every space stripped (apparently
# mangled by copy/paste), yielding malformed User-Agent values such as
# 'Mozilla/5.0(WindowsNT6.1;rv:2.0.1)...'; the spacing is restored here.
useragentpool = [
        'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
        ]


# One User-Agent picked at random, fixed for the lifetime of this run.
ua_headers = {'User-Agent': random.choice(useragentpool)}


def req_arg(k, pic_name):
    """Build the query parameters for one page of Baidu image-search results.

    Parameters
    ----------
    k : str
        Result offset ('pn') as a string; Baidu pages results 30 at a time.
    pic_name : str
        The search keyword (sent as both 'queryWord' and 'word').

    Returns
    -------
    dict
        Parameters ready to be passed to urllib.parse.urlencode().
    """
    # BUGFIX: the original file had a stray indented `return data` left over
    # from a commented-out variant of this function, which raised an
    # IndentationError at import time; it has been removed.
    return {
         'tn': 'resultjson_com',
         'ipn': 'rj',
         'ct': '201326592',
         'fp': 'result',
         'queryWord': pic_name,
         'cl': '2',
         'lm': '-1',
         'ie': 'utf-8',
         'oe': 'utf-8',
         'word': pic_name,
         'pn': k,          # result offset
         'rn': '30'}       # page size: 30 results per request


#print(parse.urlencode(data))

#data_arg = bytes(parse.urlencode(data),encoding = 'utf-8')


def cir_par_url():
    """Prompt for a keyword and an image count, scrape the matching Baidu
    image-search JSON pages for thumbnail URLs, then pass the collected
    URLs to download().

    Side effects: reads a placeholder image from disk, performs network
    requests, and sets the module-level global `pic_name`, which download()
    uses to name output files.
    """
    # Placeholder image written in place of any picture that fails to
    # download. Raw string avoids invalid backslash escapes in the path.
    with open(r"D:\web crawler爬虫\我的爬虫project\无法显示图片.png", 'rb') as f:
        def_pic = f.read()

    thumb_urls = []

    global pic_name
    pic_name = input("输入你想爬取得图片名字>>>:")

    quantity = int(input("输入你要得到的图片的数量(注:只能为三十的倍数)>>>:"))

    # Each thumbnail URL sits between "thumbURL":"..." and "middleURL" in
    # the JSON payload. Compiled once, outside the loop; raw string so the
    # \s escapes reach the regex engine intact.
    pat = re.compile(r'"thumbURL":"([\s\S]*?)"[\s\S]*?"middleURL"')

    # BUGFIX: the original looped `for i in range(1, quantity//30+1)` with
    # pn = i*30, which starts at offset 30 and silently skips the first
    # results page. Start at pn=0 instead; the number of pages fetched
    # (quantity // 30) is unchanged.
    for page in range(quantity // 30):
        data_arg = parse.urlencode(req_arg(str(page * 30), pic_name))
        url = 'https://image.baidu.com/search/acjson?' + data_arg

        req = request.Request(url, headers=ua_headers)
        response = request.urlopen(req)
        info = response.read().decode('utf-8')

        thumb_urls = pat.findall(info) + thumb_urls

    download(thumb_urls, def_pic)


def download(l_t, def_pic):
    """Download every image URL in l_t to D:\\图片\\<pic_name>-<i>.jpg.

    Parameters
    ----------
    l_t : list[str]
        Image URLs to fetch.
    def_pic : bytes
        Raw bytes of a placeholder image, written in place of any image
        whose download fails.

    Relies on the module-level globals `pic_name` (output name prefix),
    `ua_headers` and `logger`. Failures are logged and recorded; a summary
    is printed at the end.
    """
    total = len(l_t)
    failed = []

    for i, img_url in enumerate(l_t, start=1):
        # Raw string keeps the Windows path's backslashes literal.
        path = r"D:\图片\%s-%d.jpg" % (pic_name, i)
        try:
            req = request.Request(img_url, headers=ua_headers)
            img_bytes = request.urlopen(req).read()

            with open(path, 'wb') as f:
                f.write(img_bytes)

        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        except Exception:
            failed.append("第%s张" % i)

            # Write the placeholder so the numbering of saved files stays
            # contiguous even when a download fails.
            with open(path, 'wb') as f:
                f.write(def_pic)

            logger.error('发现错误+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
            logger.error('错误为=====>>:%s' % (traceback.format_exc()))

        print("下载进度: ", str(round(i / total * 100, 2)) + '%')  # progress indicator

    print('图片下载完毕共%s张图片' % total, '下载失误图片为:', failed)
        

# Entry point: run the crawl, then detach the file handler so the log
# file is no longer written to by this logger.
cir_par_url()


logger.removeHandler(file_handler)

#
#print(info)
#
#print(type(info))

#"hoverURL":"https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=1370419985,4280388849&fm=27&gp=0.jpg"
#
#"thumbURL":"https://ss0.bdstatic.com/70cFvHSh_Q1YnxGkpoWK1HF6hhy/it/u=2367471971,4193239205&fm=27&gp=0.jpg"
#
#


 
