First, a quick look at the result: (screenshot of the downloaded images)

The rest of this post walks through how to write the code, step by step.
1. Look at the Baidu Images URLs for a few different keywords: the part after "word=" is a string of odd-looking characters. Taking 流浪地球 (The Wandering Earth) as an example, the string is %E6%B5%81%E6%B5%AA%E5%9C%B0%E7%90%83. This is simply the URL-encoded (percent-encoded) form of the keyword, so we can import urllib.parse to produce the encoding for any keyword.
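A minimal sketch to confirm this, using 流浪地球 as the example keyword:

import urllib.parse

print(urllib.parse.quote('流浪地球'))   # -> %E6%B5%81%E6%B5%AA%E5%9C%B0%E7%90%83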
Next, look at the page source of a Baidu Images search result. The function below downloads the HTML source of a page:
def open_url(url):
    # Pretend to be a normal browser, otherwise Baidu may reject the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4843.400 QQBrowser/9.7.13021.400'}
    req = urllib.request.Request(url=url, headers=headers)
    page = urllib.request.urlopen(req)
    html = page.read().decode('utf-8')
    return html
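As a quick sanity check (not part of the final script), open_url can be fed a search URL built from the encoded keyword. The shortened URL below is only an assumption for illustration; the full parameter list used in the complete script later is much longer:

import urllib.parse
import urllib.request   # needed by open_url above

key_words = urllib.parse.quote('流浪地球')
url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + key_words
html = open_url(url)     # uses the open_url defined above
print(len(html))         # the page source should be non-empty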
Looking at the page source, each image entry carries several URLs. Trying each of them shows that objURL is the one we actually need, so the image addresses can be extracted from the source like this:
photo_link = r'"objURL":"(http://[^"]*\.jpg)"'   # note the escaped dot: only match URLs that really end in ".jpg"
imgs = re.findall(photo_link, html)
The matched image links are returned by re.findall as a list and stored in imgs.
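As a small illustration (the HTML snippet below is hand-written, not real Baidu output), the pattern pulls out exactly the objURL values:

import re

sample = '"objURL":"http://example.com/img/0001.jpg","fromURL":"http://example.com/page"'
photo_link = r'"objURL":"(http://[^"]*\.jpg)"'
print(re.findall(photo_link, sample))   # ['http://example.com/img/0001.jpg']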
With the links in hand, each image can be downloaded in a for loop:
for each in imgs:
    # i is a global counter used to name the files; file_name is the target folder
    if each.endswith('.jpg'):
        img = urllib.request.urlopen(each)
        img = img.read()
        with open(file_name + '/' + str(i) + '.jpg', 'wb') as img_file:
            img_file.write(img)
        i += 1
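One optional hardening step that is not in the original snippet: some objURL targets are slow or unreachable, so passing a timeout to urlopen and skipping links that fail keeps the loop from hanging. A sketch:

import urllib.request

for each in imgs:
    if each.endswith('.jpg'):
        try:
            img = urllib.request.urlopen(each, timeout=10).read()   # give up after 10 seconds
        except Exception:
            continue                                                # skip links that fail
        with open(file_name + '/' + str(i) + '.jpg', 'wb') as img_file:
            img_file.write(img)
        i += 1

The complete script, putting all of the steps above together, is as follows.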
import urllib.request
import urllib.parse
import ssl
import re
import time
import os

# pymysql and cv2 were imported in the original post but are never used, so they are left out here
i = 0    # global counter used to number the downloaded images

# Skip HTTPS certificate verification (some image hosts have broken certificates)
ssl._create_default_https_context = ssl._create_unverified_context

def open_url(url):
    # Pretend to be a normal browser, otherwise Baidu may reject the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4843.400 QQBrowser/9.7.13021.400'}
    req = urllib.request.Request(url=url, headers=headers)
    page = urllib.request.urlopen(req)
    html = page.read().decode('utf-8')
    return html

def get_img(html, file_name, num):
    global i
    os.makedirs(file_name, exist_ok=True)    # create the folder once; keep it on later pages
    # image addresses (objURL fields)
    photo_link = r'"objURL":"(http://[^"]*\.jpg)"'
    imgs = re.findall(photo_link, html)
    # relative link of the "next page" button (the anchor text 下一页 means "next page")
    last_link = r'<a href="(/search/flip[^"]*)" class="n">下一页</a>'
    last = re.findall(last_link, html)
    for each in imgs:
        if i >= num:
            print('Download finished!')
            return '---', -1
        if each.endswith('.jpg'):
            try:
                img = urllib.request.urlopen(each).read()
            except Exception:
                continue    # skip links that cannot be downloaded
            with open(file_name + '/' + str(i) + '.jpg', 'wb') as img_file:
                img_file.write(img)
            i += 1
    return last, i    # this page is done, move on to the next one

if __name__ == "__main__":
    key_word = input('Keyword of the images to download: ')
    num = int(input('Number of images to download: '))
    print('Download started... ...')
    key_words = urllib.parse.quote(key_word)
    baidu = 'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1548421364447_R&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1548421364449%5E00_1351X629&word='
    url = baidu + key_words
    while True:
        try:
            last, a = get_img(open_url(url), key_word, num)
            if a == -1:    # enough images have been downloaded
                break
            if not last:   # no "next page" link was found
                break
            url = 'http://image.baidu.com' + last[0]
        except Exception:
            time.sleep(0.5)    # transient network error: wait a bit and retry the same page
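An example session (the keyword and count below are only illustrative); the images end up in a folder named after the keyword, numbered 0.jpg, 1.jpg, and so on:

Keyword of the images to download: 流浪地球
Number of images to download: 20
Download started... ...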