# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import urllib.request
import requests
import time
import json
import sys
import re
import os
# URL template of the crawl target (Bing async image-search endpoint)
CRAWL_TARGET_URL = 'https://cn.bing.com/images/async?q=%s&first=%d&count=%d&relp=%d&lostate=r&mmasync=1'
# Number of images fetched per request (35 is this endpoint's per-page size)
NUMS_PER_CRAWL = 35
# Minimum accepted image size in bytes; smaller payloads are discarded
MIN_IMAGE_SIZE = 10
def get_image(url, path, count, min_size=None):
    """Download one image from *url* and save it as ``<count>.<ext>`` under *path*.

    Parameters:
        url: direct image URL; the file extension is derived from its last '.'.
        path: destination directory (created if missing, including parents).
        count: sequence number used as the file-name stem.
        min_size: minimum acceptable payload size in bytes; defaults to the
            module-level MIN_IMAGE_SIZE.

    Returns:
        0 on success, -1 if the payload is smaller than min_size,
        -2 on a download error, -3 on a write error.
    """
    if min_size is None:
        min_size = MIN_IMAGE_SIZE
    try:
        # 'with' guarantees the connection is closed (the original leaked it).
        with urllib.request.urlopen(url, timeout=5) as u:
            t = u.read()
        # BUG FIX: the original used sys.getsizeof(t), which measures the
        # Python object overhead (>= ~33 bytes even for b''), so the minimum
        # size check could never fire. len(t) is the actual payload size.
        if len(t) < min_size:
            return -1
    except Exception as e:
        print(url, e)
        return -2
    # Extract the extension from the URL; fall back to '.jpg' when the URL
    # has no recognizable extension (the original crashed on a None match).
    m = re.match(r"^\.[a-zA-Z]+", url[url.rfind('.'):])
    frmt = m.group(0) if m else '.jpg'
    fname = os.path.join(path, str(count) + frmt)
    try:
        # makedirs handles nested paths; exist_ok avoids a race with mkdir.
        os.makedirs(path, exist_ok=True)
        with open(fname, 'wb') as f:
            f.write(t)
    except Exception as e:
        print(fname, e)
        return -3
    return 0
def crawl_data(info, path, num):
    """Crawl up to *num* images for keyword *info* from Bing image search.

    Parameters:
        info: search keyword (substituted into the query URL).
        path: destination directory, passed through to get_image().
        num: total number of images to download before stopping.
    """
    first = 0
    count = 0
    # 'with' ensures the session (and its pooled connections) is closed even
    # on the early returns below — the original leaked it.
    with requests.Session() as s:
        while count < num:
            u = CRAWL_TARGET_URL % (info, first, NUMS_PER_CRAWL, NUMS_PER_CRAWL)
            # 3.05s connect timeout, 10s read timeout.
            req = s.get(url=u, timeout=(3.05, 10))
            bf = BeautifulSoup(req.text, "html.parser")
            imgtags = bf.find_all("a", class_="iusc")
            # BUG FIX: if a page yields no result anchors the original looped
            # forever re-requesting; stop when the result stream dries up.
            if not imgtags:
                return
            for e in imgtags:
                if count == num:
                    return
                urldict = json.loads(e.get('m'))
                if get_image(urldict["murl"], path, count) < 0:
                    continue
                print("第%d张图片下载完成,总进度%d%%" % (count + 1, (count + 1) * 100 / num))
                sys.stdout.flush()
                count = count + 1
                time.sleep(0.01)
            first = first + NUMS_PER_CRAWL
            time.sleep(0.1)
if __name__ == '__main__':
    # Download a fixed number of images for each keyword and report the
    # total elapsed time at the end.
    tstart = time.time()
    key_words = ['行李', '衣服']
    picture_num = 1000
    for word in key_words:
        if not os.path.exists('./' + word):
            os.makedirs('./' + word)
        crawl_data(word, './' + word + '/', picture_num)
    print("所有图片下载完毕,总用时%.2fs" % (time.time() - tstart))
# 此代码为网上所找,不记得网址了。原作看见请告知。
# (Found online; original source URL unknown — original author, please get in touch.)