1、实现原理
先导入相关模块
# import webbrowser as web
# import tkinter.messagebox
import re
import os
import requests
# from PIL import Image, ImageTk
import time
import imghdr
1)通过requests获取页面源码,通过re获取图片地址,再通过requests获取图片二进制流并写入本地文件
2)通过imghdr判断下载的文件是否为图片;如果不是,则删除该文件,继续下载下一张图片
2、代码
# import webbrowser as web
# import tkinter.messagebox
import re
import os
import requests
# from PIL import Image, ImageTk
import time
import imghdr
# Shared module-level list of image URLs: getImageUrl appends to it and downloadImage iterates over it.
images = []
# Collect image URLs into the list.
# num is the maximum number of images; some images cannot be downloaded, but the number of URLs is fixed (a URL alone does not tell whether it points to an image; this was adapted to the author's own odd requirement)
def getImageUrl(keyword, num):
    """Collect image URLs for *keyword* from Baidu image search, then download them.

    Pages through the "flip" search results in steps of 20, extracts every
    ``"objURL"`` value with a regular expression and appends URLs not seen
    before to the module-level ``images`` list. Collection stops once ``num``
    URLs have been gathered or the search has stalled too often. Afterwards
    ``downloadImage(keyword)`` is called to fetch the collected URLs.

    Args:
        keyword: Search term; inserted as-is into the query string.
        num: Upper bound on the number of URLs to collect.
    """
    global images
    # Downloads sometimes make no progress for a long time; these two
    # counters bound the time spent. The stall count is cumulative (never
    # reset), and the very first pass always counts as one stalled round
    # because both counters start at 0.
    stalled_rounds = 0
    last_count = 0
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("开始获取图片网址!")
    pn = 0
    # Number of image URLs collected so far.
    imageNum = 0
    # Set mirror of `images` so the duplicate check is O(1) per URL.
    seen = set(images)
    while imageNum < num:
        if imageNum == last_count:
            stalled_rounds += 1
            if stalled_rounds > 5:
                break
        else:
            last_count = imageNum
        print("pn=" + str(pn) + ",获取了" + str(imageNum) + "张图片")
        # Baidu image search URL: fill in the keyword and the page offset.
        # Paging is driven by the pn parameter, 20 images per page; according
        # to the original author a single search tops out at about 2000 images.
        url = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={}&pn={}'.format(keyword, pn)
        try:
            # Same timeout as downloadImage, so a dead connection cannot hang
            # the loop forever.
            html = requests.get(url, timeout=7)
        except Exception:
            # Exception (not BaseException) so Ctrl-C still aborts the run.
            print("出现错误!")
            # Retry the same page; the stall counter above bounds the retries.
            continue
        for found in re.findall('"objURL":"(.*?)",', html.text, re.S):
            if imageNum >= num:
                # Honour num as a real upper bound instead of taking the
                # whole page.
                break
            if found not in seen:
                seen.add(found)
                images.append(found)
                imageNum = len(images)
        pn += 20
    print("图片路径获取成功!")
    downloadImage(keyword)
def downloadImage(keyword):
    """Download every URL in the module-level ``images`` list into a folder.

    The folder is named after *keyword* and created if it does not exist yet.
    Each response body is written to ``<keyword>/<n>.jpg``; files that
    ``imghdr`` does not recognise as an image are deleted again, so ``n`` only
    advances for images that were actually kept. When the loop is finished the
    shared ``images`` list is reset to an empty list.

    Args:
        keyword: Search term, used as the target directory name and in the
            progress messages.
    """
    global images
    print("开始创建文件夹")
    # exist_ok: running the script a second time for the same keyword must
    # not crash.
    os.makedirs(keyword, exist_ok=True)
    print("文件夹创建成功,开始下载图片")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    # Index of the next image to save; also used as its file name.
    imageNum = 0
    for url in images:
        print("现在进行的是第" + str(imageNum) + "张")
        try:
            html = requests.get(url, timeout=7)
        except Exception:
            # Best effort: skip URLs that cannot be fetched. Exception (not
            # BaseException) so Ctrl-C still aborts the run.
            print("访问失败!")
            continue
        print("html获取成功,开始下载")
        # os.path.join instead of a hard-coded backslash, so the file lands
        # inside the folder on every platform.
        fileName = os.path.join(keyword, str(imageNum) + ".jpg")
        with open(fileName, "wb") as fileOpen:
            fileOpen.write(html.content)
        # Check whether the downloaded file is a usable image.
        # NOTE(review): imghdr is deprecated since Python 3.11 and removed in
        # 3.13; this check needs a replacement before upgrading.
        if imghdr.what(fileName) is None:
            print("这个图片已损坏")
            os.remove(fileName)
        else:
            print("图片" + str(imageNum) + "下载完成")
            imageNum += 1
    print(keyword + "图片下载已完成,开始进行下一项")
    images = []
# Wall-clock start of the whole run (URL collection plus download).
start = time.time()
# The number of images actually saved is not controllable. To download other
# Baidu images, replace the keyword '仓鼠' (hamster) below with your own;
# 100 is the upper bound on the number of images.
getImageUrl('仓鼠', 100)
elapsed = round(time.time() - start)
print("图片下载成功,共耗时" + str(elapsed) + "秒")
3、运行效果(我天,数量不可控,算了,这就是用来下图片的,懒得改)