爬取图片,并按比例划分数据集
上课老师布置的作业,做完保存下来。
(1)批量爬取不同的10个明星的图片各100张;
(2)每个明星的图片放一个文件夹,并按“name_###.jpg”的格式命名,如:liudehua/liudehua_000.jpg;
(3)对每个明星的图片,按7:2:1的比例划分为train、validation、test三个数据集,并分别将图片名称保存到train.txt、validation.txt、test.txt;
代码实现
# -*- coding: utf-8 -*-
import sys
import os
import re
import uuid
import requests
import random
# Request headers captured from an Edge browser session on image.baidu.com,
# sent so the site treats the request like ordinary browser traffic.
# NOTE: header names must not contain spaces; the original key
# 'Accept - Encoding' was a typo for 'Accept-Encoding'.
HEADERS = {
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
    'Connection': 'Keep-Alive',
    'Host': 'image.baidu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063',
}
def download_image(key_word, maximum_download, maximum_failure, headers=HEADERS):
    """Search Baidu Images for a keyword and save the resulting images.

    Images are written to ``./face_image/<key_word>/`` and named
    ``<key_word>_NNN.jpg``. ``NNN`` is the zero-padded count of images saved
    so far, so the numbering stays contiguous even when some downloads fail.

    Args:
        key_word: Search keyword; also used as the folder name and as the
            file-name prefix.
        maximum_download: Stop once this many images have been saved.
        maximum_failure: Stop once this many attempts have failed (a request
            that raises, an HTTP error status on an image, a file that
            cannot be written, or a result page without any image URL).
        headers: HTTP headers sent with the search-page request.
    """
    saved_count = 0    # images written so far; also numbers the saved files
    page_offset = 0    # value passed as the "pn" query parameter
    failure_count = 0
    str_gsm = '00'

    # Each celebrity gets a dedicated folder.
    save_path = './face_image' + '/' + key_word
    os.makedirs(save_path, exist_ok=True)

    while saved_count < maximum_download and failure_count < maximum_failure:
        # Search URL of the paged ("flip") Baidu image results.
        url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=%s&pn=%s&gsm=%s&ct=&ic=0&lm=-1&width=0&height=0' % (
            key_word, str(page_offset), str_gsm
        )
        print("page url: %s" % (url))
        try:
            # Fetch the result page (timeout is in seconds).
            result = requests.get(url, timeout=10, headers=headers).text
            # Extract every original-image URL embedded in the page source.
            img_urls = re.findall('"objURL":"(.*?)",', result, re.S)
            if not img_urls:
                raise ValueError('无法搜索到图片,或URL无效')

            for img_url in img_urls:
                img = requests.get(img_url, timeout=30)
                # Do not store an HTTP error page under a .jpg name; treat
                # it as a failed download instead.
                img.raise_for_status()
                img_name = save_path + '/' + key_word + '_' + str(saved_count).zfill(3) + '.jpg'
                with open(img_name, 'wb') as f:
                    f.write(img.content)
                saved_count += 1
                page_offset += 1
                if saved_count >= maximum_download:
                    break
        except Exception as e:
            # Best-effort crawl: report the problem, count it against the
            # failure budget, and move the offset past the failing entry so
            # the next page request does not start at the same image.
            print('【错误】当前图片无法下载,%s' % e)
            failure_count += 1
            page_offset += 1
    print('下载完成')
def main():
    """Download images for ten celebrities and write 7:2:1 split lists.

    For every name in the built-in list, images are downloaded into
    ``./face_image/<name>/``. The files found in each folder are then
    shuffled and divided into train / validation / test parts in a 7:2:1
    ratio, and the file names of all celebrities are written to
    ``train.txt``, ``validation.txt`` and ``test.txt``.

    The split is proportional to the number of files actually present, so a
    folder with fewer than 100 images (for example after download failures)
    is still split instead of raising ``ValueError``. The three list files
    are rewritten on every run, so a rerun cannot leave duplicated entries
    or put one image into two different splits.
    """
    # Download limit per celebrity and the number of tolerated failures.
    max_download = 100
    max_failure = 10
    # Search keywords (celebrity names in pinyin).
    key_word = ['mayun','wangfei','liuxiang','tongliya','luhan','huangbo','zhaobenshan','songxiaobao','liudehua','zhoujielun']

    for name in key_word:
        download_image(name, max_download, max_failure)
    print('全部图片已下载完成')

    train = []
    validation = []
    test = []
    for name in key_word:
        file_path = './face_image' + '/' + name
        images = os.listdir(file_path)
        random.shuffle(images)
        # 7:2:1 split; integer division leaves any remainder in the test part.
        train_end = len(images) * 7 // 10
        validation_end = train_end + len(images) * 2 // 10
        train.extend(images[:train_end])
        validation.extend(images[train_end:validation_end])
        test.extend(images[validation_end:])

    # Write each list once, in 'w' mode, so that stale entries from an
    # earlier run are replaced rather than appended to.
    for list_name, image_names in (
        ('train.txt', train),
        ('validation.txt', validation),
        ('test.txt', test),
    ):
        with open(list_name, 'w', encoding='utf-8') as f:
            f.writelines(image_name + '\n' for image_name in image_names)
    print(' 保存完成!')
# Script entry point: run the download-and-split pipeline only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()