# 根据搜索关键词爬取百度图片
import os
import re
import socket
import ssl
import urllib
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup
# Allow HTTPS downloads without certificate verification (image CDNs often
# serve mismatched certs) and cap every socket operation at 10 seconds so a
# stalled download raises socket.timeout instead of hanging forever.
ssl._create_default_https_context = ssl._create_unverified_context
socket.setdefaulttimeout(10)

# Image save directory. NOTE(review): "F:baidu_img/" is a drive-relative
# Windows path ("F:" with no separator) — likely intended "F:/baidu_img/";
# kept as-is to preserve behavior, confirm with the author.
save_path = "F:baidu_img/"
# makedirs(..., exist_ok=True) replaces the race-prone
# `if os.path.exists(...)==False: os.mkdir(...)` and also creates parents.
os.makedirs(save_path, exist_ok=True)
def downmloadPicture(keyword, start_page, end_page):
    """Crawl Baidu image search results for *keyword* and save the images.

    Pages are 1-based and inclusive: page ``i`` maps to the flip API's
    ``pn = (i - 1) * 20`` (each result page holds 20 images; page 1 is
    ``pn=0``). Downloads are written into the module-level ``save_path``
    directory. Prints an error and returns early when ``start_page`` < 1.
    Returns None.
    """
    start_num = int(start_page)
    end_num = int(end_page)
    # Guard clause instead of wrapping the whole crawl in an `else`.
    if start_num < 1:
        print("Error:开始页码需要大于等于1")
        return
    # Percent-encode the keyword so non-ASCII (e.g. Chinese) terms form a
    # valid URL. Bug fix: the original interpolated the *global* ``word``
    # instead of the ``keyword`` parameter, which broke the function for
    # any caller that did not happen to define that global.
    url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
           + urllib.parse.quote(keyword) + '&pn=')
    # urlretrieve's default Python User-Agent is often blocked; install an
    # opener that sends a browser-like UA for all urllib downloads.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10')]
    urllib.request.install_opener(opener)
    for i in range(start_num, end_num + 1):
        page_url = url + str((i - 1) * 20)  # pn advances by 20 per page
        print("正在下载第", i, "页:", page_url)
        result = requests.get(page_url)
        # Pull every original-image URL out of the page's embedded JSON.
        pic_url = re.findall('"objURL":"(.*?)",', result.text, re.S)
        for each in pic_url:
            img_name = each.split("/")[-1]  # last path component of the URL
            try:
                urllib.request.urlretrieve(each, save_path + img_name)
            except socket.timeout:
                # Timed out: retry up to 5 more times before giving up.
                for count in range(1, 6):
                    try:
                        urllib.request.urlretrieve(each, save_path + img_name)
                        break
                    except socket.timeout:
                        print('Reloading for %d time' % count if count == 1
                              else 'Reloading for %d times' % count)
                else:
                    # All 5 retries timed out.
                    print("当前图片下载失败!,已跳过")
                    continue
            except Exception:
                # Narrowed from BaseException so KeyboardInterrupt and
                # SystemExit still propagate; any other download error is
                # logged and the image skipped. (The original also caught
                # requests.exceptions.ConnectionError here, but urlretrieve
                # never raises requests exceptions — that branch was dead.)
                print('错误,当前图片无法下载,已跳过')
                continue
        print("第", i, "页爬取完毕")
    print("关键词:{0}, 第{1}页至第{2}页爬取完毕!".format(keyword, start_num, end_num))
if __name__ == '__main__':
    # Interactive entry point: prompt for a keyword and an inclusive
    # 1-based page range, then crawl those Baidu image result pages.
    word = input("请输入关键词: ")
    start_page = input("请输入爬取开始页码: ")
    end_page = input("请输入爬取结束页码: ")  # fixed prompt typo: 爬去 -> 爬取
    downmloadPicture(word, start_page, end_page)
    print("爬取完毕!")