# 通过python实现爬取网络图片,并保存到本地文件夹中。
# (Crawl images from the web with Python and save them into a local folder.)
import re
import requests# 请求网页
import traceback
import os
def dowmloadPic(html, keyword, startNum):
    """Download every image whose "objURL" appears in *html*.

    Args:
        html: Raw HTML text of one Baidu image-search result page.
        keyword: The search keyword; also used to name the sub-folder.
        startNum: Index of the first image, so numbering continues
            across multiple result pages.

    Returns:
        The index after the last successfully handled image; the caller
        feeds it back in as ``startNum`` for the next page.

    NOTE(review): relies on the module-level ``root`` directory set up by
    the ``__main__`` block — confirm it exists before calling.
    """
    # Pretend to be a browser: some sites reject the default
    # python-requests User-Agent as a crawler.
    headers = {'user-agent': 'Mozilla/5.0'}
    # Baidu's "flip" result pages embed each original image URL as "objURL".
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    i = startNum
    subroot = root + '/' + keyword
    txtpath = subroot + '/download_detail.txt'  # running log of downloaded URLs
    print('找到关键词:' + keyword + '的图片,现在开始下载图片...')
    for each in pic_url:
        a = '正在下载第' + str(i + 1) + '张图片,图片地址:' + str(each) + '\n'
        print(a)
        path = subroot + '/' + str(i + 1)
        try:
            # exist_ok avoids the exists()-then-mkdir race of the original.
            os.makedirs(subroot, exist_ok=True)
            if not os.path.exists(path):
                pic = requests.get(each, headers=headers, timeout=10)
                # `with` closes the file; no explicit close() needed.
                with open(path + '.jpg', 'wb') as f:
                    f.write(pic.content)
                with open(txtpath, 'a') as f:
                    f.write(a)
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # interrupt the crawl; any per-image failure is logged and skipped.
            traceback.print_exc()
            print('当前图片无法下载')
            continue
        i += 1
    return i
if __name__ == '__main__':
    # Browser UA so Baidu does not serve the anti-crawler placeholder page.
    headers = {'user-agent': 'Mozilla/5.0'}
    words = input("请输入爬取关键词: ")
    root = './爬取的图片,关键词为:'
    root = root + words
    if not os.path.exists(root):
        os.mkdir(root)  # top-level output folder for this keyword
    lastNum = 0  # image counter carried across pages
    pageId = 0   # Baidu "pn" offset, advances 20 per result page
    # Number of result pages to fetch (each page yields up to 20 images).
    for i in range(2):
        url = ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word='
               + words + "&pn=" + str(pageId)
               + "&gsm=?&ct=&ic=0&lm=-1&width=0&height=0")
        pageId += 20
        # timeout added so a stalled connection cannot hang the crawl forever.
        html = requests.get(url, headers=headers, timeout=10)
        lastNum = dowmloadPic(html.text, words, lastNum)
    print('图片下载完成!')