Training the neural network requires a large dataset, so a crawler was used to download tens of thousands of images from the web. The program is adapted from an article on how to crawl thousands of web images for a given keyword; the code is as follows.
# -*- coding:utf-8 -*-
import re
import requests
import traceback
import os

def downloadPic(html, keyword, startNum):
    kv = {'user-agent': 'Mozilla/5.0'}
    # Extract the image URLs from the "objURL" fields embedded in the search result page
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    i = startNum
    root = 'D:/pics/'
    print('Found images for keyword "' + keyword + '", starting download...')
    for each in pic_url:
        print('Downloading image ' + str(i + 1) + ', URL: ' + str(each))
        # Save each image under its original file name
        path = root + each.split('/')[-1]
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                pic = requests.get(each, headers=kv, timeout=10)
                with open(path, 'wb') as f:
                    f.write(pic.content)
        except Exception:
            traceback.print_exc()
            print('[Error] the current image could not be downloaded')
            continue
        i += 1
    return i
if __name__ == '__main__':
    kv = {'user-agent': 'Mozilla/5.0'}
    lastNum = 0
    # words is a list, so images for several keywords can be saved in one run
    words = ['keyword1', 'keyword2', 'keyword3']
    for word in words:
        # word = input("Input key word: ")
        if word.strip() == "exit":
            break
        pageId = 0
        # the range here controls how many result pages are crawled; pn advances by 20 per page
        for i in range(2):
            url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + "&pn=" + str(pageId) + "&gsm=?&ct=&ic=0&lm=-1&width=0&height=0"
            pageId += 20
            result = requests.get(url, headers=kv)
            lastNum = downloadPic(result.text, word, lastNum)
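
Some of the URLs extracted this way are stale or return error pages instead of image data, which would pollute the training set. Below is a minimal clean-up sketch, assuming Pillow is installed and the images were saved to D:/pics/ as above; the verify_images helper is only an illustration, not part of the original program.

# -*- coding:utf-8 -*-
import os
from PIL import Image  # assumes Pillow is installed: pip install pillow

def verify_images(root='D:/pics/'):
    # Delete any file that cannot be opened as a valid image (hypothetical helper)
    removed = 0
    for name in os.listdir(root):
        path = os.path.join(root, name)
        try:
            with Image.open(path) as img:
                img.verify()  # raises an exception if the file is not a valid image
        except Exception:
            os.remove(path)
            removed += 1
    print('Removed ' + str(removed) + ' invalid files')

if __name__ == '__main__':
    verify_images()

Running this once after the crawl leaves only files that Pillow can parse, which avoids errors when the images are later loaded for training.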