python爬虫爬取百度图片

爬虫爬取百度图片

因公司业务需要,而且公司人手不足,我这个测试工程师需要临时客串一下其他职位,所以,由我来爬取百度图片。

说明

1、最近稍微有点儿忙,没顾得上整理。而且代码量比较少,所以注释比较少。
2、如果需要直接使用我的代码,请将代码中的文件路径和文件名改成你自己的。具体使用方法我会在下面的代码中详细介绍。
3、运行环境为 Python 2.7(代码使用了 compiler.ast 模块,该模块在 Python 3 中已被移除)。

实现思路及功能

1.读取excel中第一列的关键词,保存在列表中,等待遍历
2.为每个关键词开启一个线程
3.将关键词传入 img 函数,开始获取图片
4.将图片保存在指定目录

上代码

#__author__ = 'chubbysuperman'
#_*_coding=utf-8 _*_
import requests
from fake_useragent import UserAgent
import xlrd
from compiler.ast import flatten
import os
import time
import threading
def _decode_obj_url(imgUrl):
    """Decode an obfuscated ``objURL`` value into a plain URL.

    Multi-character tokens are replaced first; afterwards every remaining
    character is passed through a one-to-one substitution table. Characters
    that have no entry in the table are kept unchanged.
    """
    # The replacement characters never occur inside another token, so the
    # order of these substitutions does not matter.
    longTokens = (('_z2C$q', ':'), ('_z&e3B', '.'), ('AzdH3F', '/'))
    charMap = {
        'w': 'a', 'k': 'b', 'v': 'c', '1': 'd', 'j': 'e', 'u': 'f',
        '2': 'g', 'i': 'h', 't': 'i', '3': 'j', 'h': 'k', 's': 'l',
        '4': 'm', 'g': 'n', '5': 'o', 'r': 'p', 'q': 'q', '6': 'r',
        'f': 's', 'p': 't', '7': 'u', 'e': 'v', 'o': 'w', '8': '1',
        'd': '2', 'n': '3', '9': '4', 'c': '5', 'm': '6', '0': '7',
        'b': '8', 'l': '9', 'a': '0',
    }
    for token, plain in longTokens:
        imgUrl = imgUrl.replace(token, plain)
    return ''.join([charMap.get(ch, ch) for ch in imgUrl])


def imgUrls(keyWord, userAgent, pn):
    """Fetch one page of Baidu image-search results and decode the image URLs.

    Args:
        keyWord: Search keyword, sent as both ``queryWord`` and ``word``.
        userAgent: Value for the ``user-Agent`` request header.
        pn: Result offset passed to the search endpoint.

    Returns:
        ``{'result': [url, ...], 'status': True}`` on success, or
        ``{'result': 'wuyunlunbi', 'status': False}`` on any failure
        (network error, non-200 status, unparsable or unexpected JSON).
        A dict is always returned, so callers can index ``['result']``
        unconditionally.
    """
    url = 'https://image.baidu.com/search/index'
    params = {'tn': 'resultjson_com', 'ipn': 'rj', 'ct': '201326592', 'is': '', 'fp': 'result', 'queryWord': keyWord, 'cl': '2', 'lm': '-1', 'ie': 'utf-8', 'oe': 'utf-8', 'adpicid': '', 'st': '-1', 'z': '', 'ic': '0', 'word': keyWord, 's': '', 'se': '', 'tab': '', 'width': '', 'height': '', 'face': '0', 'istype': '2', 'qc': '', 'nc': '1', 'fr': '', 'pn': pn, 'rn': 200, 'gsm': '1e', '1491808945838': ''}

    try:
        rep = requests.get(url, headers={'user-Agent': userAgent},
                           params=params, timeout=(4, 7))
    except requests.RequestException:
        # A timeout or connection error is reported like any other failed
        # page instead of propagating into the caller's thread.
        return {'result': 'wuyunlunbi', 'status': False}

    if rep.status_code != 200:
        return {'result': 'wuyunlunbi', 'status': False}

    # Pause between successful requests, as the original code did.
    time.sleep(1)

    try:
        entries = rep.json()['data']
        # Keep only entries that actually carry an 'objURL'. The original
        # dropped the last element unconditionally, presumably because the
        # list ends with an empty placeholder -- TODO confirm against a live
        # response.
        decoded = [_decode_obj_url(entry['objURL'])
                   for entry in entries
                   if isinstance(entry, dict) and 'objURL' in entry]
    except (ValueError, KeyError, TypeError, AttributeError):
        # Body is not JSON, or does not have the expected structure.
        return {'result': 'wuyunlunbi', 'status': False}

    return {'result': decoded, 'status': True}
def img(keyWord, userAgent, saveRoot=r'D:\yyyyy5', pages=100):
    """Download the images found for one keyword into ``saveRoot/keyWord``.

    Args:
        keyWord: Search keyword; also used as the sub-folder name.
        userAgent: User-Agent string forwarded to ``imgUrls``.
        saveRoot: Directory under which the keyword folder is created.
            Defaults to the path the script originally hard-coded.
        pages: Number of result pages to request; page ``i`` uses the
            offset ``pn = i * 20``.

    Returns:
        None. Images are written as ``<timestamp>.jpg`` files; progress and
        errors are printed.
    """
    collected = []
    seen = set()
    for page in range(pages):
        try:
            fetched = imgUrls(keyWord, userAgent, pn=page * 20)
        except requests.RequestException as e:
            # A network error on one page should not abort the whole keyword.
            print('error_page_%s: %s' % (page, e))
            continue
        # Use the status flag instead of searching for the failure sentinel
        # inside each URL.
        if not fetched or not fetched.get('status'):
            print('error_page_%s' % page)
            continue
        for link in fetched['result']:
            link = link.replace(" ", "")
            # Consecutive offsets may return overlapping results; fetch each
            # URL only once.
            if link not in seen:
                seen.add(link)
                collected.append(link)

    print(len(collected))
    print(collected)

    # Create the storage directory; an already existing folder (e.g. from a
    # previous run) is not an error.
    targetDir = os.path.join(saveRoot, '%s' % keyWord)
    try:
        os.makedirs(targetDir)
    except OSError:
        if not os.path.isdir(targetDir):
            raise

    for index, link in enumerate(collected):
        print(index)
        time.sleep(0.15)
        if "yuan_" in link:
            print("error001_%s" % link)
            continue
        try:
            resp = requests.get(link, timeout=(3, 4))
            if resp.status_code != 200:
                # Do not store an error page under a .jpg name.
                print('error_status_%s_%s' % (resp.status_code, link))
                continue
            time.sleep(0.15)
            # Write the image into the keyword directory, named by the
            # current timestamp.
            target = os.path.join(targetDir, '%s.jpg' % time.time())
            with open(target, 'wb') as f:
                f.write(resp.content)
        except Exception as e:
            # Best effort: one bad URL must not stop the remaining
            # downloads, but report it rather than ignoring it silently.
            print('error_download_%s: %s' % (link, e))
if __name__ == '__main__':
    # Excel workbook holding the keywords: put them in the first column of
    # the first sheet.
    workbook = xlrd.open_workbook(r'C:\Users\Administrator\Desktop\Ashicai (2).xlsx')
    keywords = workbook.sheet_by_index(0).col_values(0)
    ua = UserAgent()
    for keyword in keywords:
        # Skip blank cells so no search is started for an empty keyword.
        if not ('%s' % keyword).strip():
            continue
        started = time.time()
        # One worker thread per keyword, each with its own random User-Agent.
        threading.Thread(target=img, args=(keyword, ua.random)).start()
        # Stagger thread start-up slightly.
        time.sleep(0.05)
        print(started)


  • 2
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值