Python 任何百度图片都能批量下载,代码不会报错的那种

# -*- coding: utf-8 -*-
"""
Created on Fri Dec  3 17:52:28 2021

@author: 86176
"""

import re
import requests
import random
import uuid
import urllib.request
import os
import time





# 构建头部,获取页面内容
def Headers():
    """Build HTTP request headers with a randomly rotated User-Agent.

    Returns a dict of browser-like headers; the User-Agent is picked at
    random from a fixed pool on every call to make repeated requests look
    less like a single bot (reduces the chance of an IP ban).
    """
    # Pool of real browser UA strings; one is chosen per request.
    user_agents = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    )
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'User-Agent': random.choice(user_agents),
    }

    
def get_url(word, frequence):
    """Crawl Baidu image search for `word` and download up to `frequence` images.

    Images are saved under ./<word>/ with random, UUID-derived filenames
    keeping their original extension. Baidu serves 30 results per page, so
    the `pn` offset is stepped by 30.

    Parameters:
        word: search keyword; also the name of the output subdirectory
              (the caller is expected to have created it, see mkdir()).
        frequence: approximate number of images to fetch.
    """
    miss = 1  # running failure counter, used only in the log messages below
    for page in range(0, frequence, 30):
        # pn is the result offset: 0, 30, 60, 90 ... (one page = 30 images)
        url = f'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&word={word}&pn={page}'
        headers = Headers()

        time.sleep(1)  # throttle so the IP is less likely to get banned
        res = requests.get(url, headers=headers).text

        # Match absolute http/https URLs ending in a known image extension.
        # Fix over the original pattern `http.:[\S]*?.(...)`: `https?:` also
        # matches plain "http://" links (the old `http.:` required a character
        # between "http" and ":"), and the dot before the extension group is
        # escaped so it no longer matches any character.
        p = r'(https?:[\S]*?\.(jpg|jpeg|png|gif|bmp|webp))'
        imglist = re.findall(p, res)
        for img in imglist:
            imgh = img[0]              # the full image URL (group 1)
            names = uuid.uuid1().hex   # random filename stem (same format as
                                       # str(uuid1()).replace("-", ""))
            pattern = img[1]           # file extension captured by group 2
            # os.path.join keeps the path portable; the original hard-coded
            # '\\' separators and therefore only worked on Windows.
            filename = os.path.join(os.getcwd(), word, f'{names}.{pattern}')
            try:
                urllib.request.urlretrieve(url=imgh, filename=filename)
                print('成功保存!')
            except urllib.request.URLError:
                print('失败了', miss, '次')
                miss += 1
            except ValueError:
                # urlretrieve raises ValueError for malformed/unknown URL types
                print('不同的失败', miss, '次')
                miss += 1


 
def mkdir(path):
    """Create directory `path` (including parents) if it does not exist.

    Uses makedirs(..., exist_ok=True), which is atomic with respect to the
    existence check and closes the check-then-create race in the original
    os.path.exists() + os.makedirs() sequence. A no-op when the directory
    already exists.
    """
    os.makedirs(path, exist_ok=True)
		
            
if __name__ == "__main__":
    # Interactive entry point: ask for a keyword and an image count,
    # create the output directory, then crawl and report elapsed time.
    keyword = input("请输入您要采集的图片名称:")
    mkdir(keyword)
    count = int(input("请输入您要采集的图片张数:"))
    t0 = time.time()
    get_url(keyword, count)
    elapsed = time.time() - t0
    print('总共用时:', elapsed, '秒')


前提是你得把导入的库下好奥!

(然后每个IP第一次都能运转这个代码,之后次数多了,IP会被百度冻结,这时候代码不报错,但就是爬不了了,换个网能继续用)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值