# -*- coding: utf-8 -*-
"""
Created on Fri Dec 3 17:52:28 2021
@author: 86176
"""
import re
import requests
import random
import uuid
import urllib.request
import os
import time
# 构建头部,获取页面内容
def Headers():
    """Build request headers with a randomly chosen User-Agent.

    Returns a dict of fixed browser-like headers plus a 'User-Agent'
    picked at random from a small pool, to reduce the chance of the
    scraper's IP being blocked.
    """
    user_agents = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        "Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
    ]
    return {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'User-Agent': random.choice(user_agents),
    }
def get_url(word, frequence):
    """Scrape up to *frequence* image results for *word* from Baidu images.

    Baidu paginates 30 images per page (pn = 0, 30, 60, ...). Every image
    link found in a result page is downloaded into ./<word>/ under a random
    filename (the directory must already exist — see mkdir()).

    Parameters:
        word:      search keyword, also used as the target directory name.
        frequence: approximate number of images to request (rounded to pages).
    """
    miss = 1
    # Fixed regex: the original 'http.:' could not match plain 'http://'
    # (the wildcard consumed the colon), and the unescaped '.' before the
    # extension group matched any character (e.g. 'Xjpg'). Compiled once,
    # outside the loop.
    img_pattern = re.compile(r'(https?://\S*?\.(jpg|jpeg|png|gif|bmp|webp))')
    for page in range(0, frequence, 30):
        # pn is the pagination offset: 30 images per page.
        url = f'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&word={word}&pn={page}'
        headers = Headers()
        time.sleep(1)  # throttle requests to lower the risk of an IP ban
        # timeout added so a stalled connection cannot hang the script forever
        res = requests.get(url, headers=headers, timeout=10).text
        for img_url, ext in img_pattern.findall(res):
            # Random, dash-free filename; ext is the captured suffix.
            names = str(uuid.uuid1()).replace("-", "")
            # os.path.join is portable (the original hard-coded Windows '\\').
            filename = os.path.join(os.getcwd(), word, f'{names}.{ext}')
            try:
                urllib.request.urlretrieve(url=img_url, filename=filename)
                print('成功保存!')
            except urllib.request.URLError:
                print('失败了', miss, '次')
                miss += 1
            except ValueError:
                # e.g. malformed URL scheme extracted from the page
                print('不同的失败', miss, '次')
                miss += 1
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Uses exist_ok=True, which is atomic and idempotent — the original
    exists()-then-makedirs() pair had a check-then-create race and would
    raise if the directory appeared between the two calls.
    """
    os.makedirs(path, exist_ok=True)
if __name__ == "__main__":
word = input("请输入您要采集的图片名称:")
mkdir(word)
frequence = int(input("请输入您要采集的图片张数:"))
start = time.time()
get_url(word,frequence)
end = time.time()
print('总共用时:',end - start,'秒')
# NOTE: make sure the imported libraries are installed before running!
# Each IP can run this code the first time, but after repeated runs Baidu
# silently blocks the IP — no error is raised, downloads just stop.
# Switching to a different network restores access.