Python 爬取百度图片用作训练数据
1. 根据关键词列表文件(代码中为 category.txt,每行一个关键词)的条目在百度爬取图片
# coding=utf-8
"""Download Baidu images for each search keyword."""
import re
import sys
import urllib
import requests
def getPage(keyword, page, n):
    """Build the Baidu image-search "flip" URL for one page of results.

    Args:
        keyword: Search term. It is percent-encoded before being placed in
            the query string ('/' is left unescaped).
        page: Zero-based page index.
        n: Multiplier applied to ``page`` to obtain the ``pn`` offset.

    Returns:
        The request URL as a string.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote

    offset = page * n
    encoded_keyword = quote(keyword, safe='/')
    url_begin = "http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word="
    # ``gsm`` carries the same offset as rendered by hex(), "0x" prefix included.
    return (url_begin + encoded_keyword + "&pn=" + str(offset)
            + "&gsm=" + hex(offset) + "&ct=&ic=0&lm=-1&width=0&height=0")
def get_onepage_urls(onepageurl):
    """Fetch one result page and return the image URLs embedded in it.

    Args:
        onepageurl: URL of a search-result page.

    Returns:
        A list of the strings captured from every ``"objURL":"...",`` entry
        in the page text, in the order they appear. An empty list is
        returned when the request fails.
    """
    try:
        # Bound the wait so a stalled server cannot hang the whole crawl.
        html = requests.get(onepageurl, timeout=15).text
    except requests.RequestException as e:
        print(e)
        return []
    # re.S lets '.' span newlines inside a captured value.
    return re.findall(r'"objURL":"(.*?)",', html, re.S)
def down_pic(pic_urls):
    """Download every image in ``pic_urls`` into the current directory.

    Files are named ``1.jpg``, ``2.jpg``, ... after the 1-based position of
    each URL in ``pic_urls``. A failed download is reported and skipped so
    the remaining URLs are still attempted.

    Args:
        pic_urls: Iterable of image URLs.
    """
    for index, pic_url in enumerate(pic_urls, start=1):
        try:
            pic = requests.get(pic_url, timeout=15)
            # Reject HTTP error responses; otherwise the body of an error
            # page would be saved under a .jpg name.
            pic.raise_for_status()
            file_name = str(index) + '.jpg'
            with open(file_name, 'wb') as f:
                f.write(pic.content)
            print('成功下载第%s张图片: %s' % (str(index), str(pic_url)))
        except (requests.RequestException, OSError) as e:
            # Network/HTTP failures and file-write failures are both
            # non-fatal: report them and move on to the next URL.
            print('下载第%s张图片时失败: %s' % (str(index), str(pic_url)))
            print(e)
if __name__ == '__main__':
    # os is needed for the per-keyword directories and is not imported at
    # the top of this script.
    import os

    # Multiplier handed to getPage(): the result offset advances by this
    # much for each page index.
    page_number = 30
    # Index of the last page to request per keyword (0 => only page 0).
    image_number = 0
    category_file = '/Users/didi/lhq/pic/category.txt'

    # Read explicitly as UTF-8 so non-ASCII keywords do not depend on the
    # platform's default encoding.
    with open(category_file, encoding='utf-8') as f:
        for line in f:
            print(line)
            # Each line of the file holds one search keyword.
            keyword = line.strip()
            if not keyword:
                # A blank line would otherwise trigger an empty search.
                continue

            # Every keyword gets its own directory; down_pic() writes into
            # the current working directory, hence the chdir.
            target_dir = os.path.join('/Users/home/pic/', keyword)
            os.makedirs(target_dir, exist_ok=True)
            os.chdir(target_dir)

            all_pic_urls = []
            page_begin = 0
            while page_begin <= image_number:
                print("第%d次请求数据" % page_begin)
                print(keyword)
                url = getPage(keyword, page_begin, page_number)
                all_pic_urls.extend(get_onepage_urls(url))
                page_begin += 1

            print("start downloading~~~")
            # De-duplicate URLs collected across pages before downloading.
            down_pic(list(set(all_pic_urls)))
    print('finished!')
2. 爬取的图片中可能会有损坏、无法打开的文件;把下面的脚本放到图片目录的上层目录运行,它会递归检查各子目录并逐个删除这些损坏的文件
from PIL import Image
import os
# Walk the current directory tree and delete every image file that Pillow
# cannot open or verify.
if __name__ == '__main__':
    # Extensions treated as images; matching is case-insensitive.
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif')
    badFilesList = []
    curDir = '.'
    for root, dirs, files in os.walk(curDir):
        for each in files:
            if not each.lower().endswith(image_suffixes):
                continue
            path = os.path.join(root, each)
            try:
                # Image.open() alone only identifies the file lazily;
                # verify() checks the contents for corruption. The context
                # manager closes the file handle in either case.
                with Image.open(path) as im:
                    im.verify()
            except Exception:
                # Pillow reports broken files through several unrelated
                # exception types, so any failure marks the file as bad.
                print('Bad file:', path)
                badFilesList.append(path)

    # Delete only after the walk has finished, so the directory tree is not
    # modified while it is being traversed.
    for each in badFilesList:
        try:
            os.remove(each)
        except OSError as e:
            print('Del file: %s failed, %s' % (each, e))