代码如下,Python 2 或 Python 3 应该皆可运行。
#-*-coding:utf-8-*-
# @meta: download images from Baidu Image Search for a given keyword, to build a labeled photo dataset with classes such as tree, building, mountain, grass, lake
import requests
from bs4 import BeautifulSoup
import sys
import os
import codecs
import re
# Py2/Py3 compatibility: `quote` moved from `urllib` to `urllib.parse`.
# Check the major version number instead of a fragile substring test on
# sys.version (which would misclassify e.g. Python 2.6, and could in theory
# match "2.7" inside an unrelated part of a 3.x build string).
if sys.version_info[0] == 2:
    from urllib import quote
else:
    from urllib.parse import quote

# Directory containing this script; downloaded images are stored under
# <script dir>/pics/<keyword>/ (one sub-folder per search keyword).
path = sys.path[0] + os.sep
pic_folder = path + 'pics' + os.sep
def down_pic(pic_url, folder, i):
    """Download one image and save it as '<i+1>.jpg' inside *folder*.

    Skips the download when the target file already exists, making reruns
    resumable.  Best-effort: any network or filesystem error is reported
    and swallowed (the original code had a bare ``except: pass`` that hid
    every failure, including programming errors) so one bad URL cannot
    abort the whole crawl.
    """
    target = os.path.join(folder, str(i + 1) + '.jpg')
    if os.path.exists(target):
        return  # already downloaded on a previous run
    try:
        resp = requests.get(pic_url, timeout=15)
        resp.raise_for_status()  # don't save HTML error pages as .jpg
        with open(target, 'wb') as f:
            f.write(resp.content)
        if (i + 1) % 10 == 0:  # progress line every 10 images
            print('成功下载第%s张图片: %s' % (str(i + 1), str(pic_url)))
    except Exception as e:  # report instead of silently dropping the image
        print('download failed: %s (%s)' % (pic_url, e))
def get_onepage_urls(onepageurl):
    """Scrape one Baidu Images result page.

    Returns ``(pic_urls, next_page_url)``: the list of original image URLs
    found on the page, and the absolute URL of the next result page
    ('' when there is none).  On any fetch error the exception is printed
    and ``([], '')`` is returned so the caller simply stops paging.
    """
    if not onepageurl:  # empty/None means the previous page was the last one
        print('已到最后一页, 结束')
        return [], ''
    try:
        # timeout so one stuck page cannot hang the whole crawl
        resp = requests.get(onepageurl, timeout=15)
        resp.encoding = 'utf-8'
        html = resp.text
    except Exception as e:
        print(e)
        return [], ''
    # The flip-style result page embeds the original image URL as "objURL".
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # Relative href of the "next page" anchor, if present.
    # (Original passed a compiled pattern *plus* flags to findall -- redundant.)
    next_page_urls = re.findall(r'<a href="(.*)" class="n">下一页</a>', html)
    next_page_url = 'http://image.baidu.com' + next_page_urls[0] if next_page_urls else ''
    return pic_urls, next_page_url
def main(keyword, pages):
    """Crawl Baidu Images for *keyword* and download the results.

    Fetches the first result page plus up to *pages* follow-up pages,
    de-duplicates the collected image URLs, and downloads them into
    ``pics/<keyword>/`` so each keyword becomes one class folder.
    """
    folder = pic_folder + keyword + os.sep  # one sub-folder per keyword = one class label
    if not os.path.exists(folder):
        # makedirs, not mkdir: the parent 'pics' directory is never created
        # elsewhere, so os.mkdir would raise on the very first run.
        os.makedirs(folder)
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    url_init = url_init_first + quote(keyword, safe='/')
    all_pic_urls = []
    onepage_urls, next_page_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    page = 0  # number of follow-up pages fetched so far
    while page < pages:
        onepage_urls, next_page_url = get_onepage_urls(next_page_url)
        page += 1
        print('第%s页' % str(page))
        if next_page_url == '' and onepage_urls == []:
            break  # ran out of results before reaching the page limit
        all_pic_urls.extend(onepage_urls)
    all_pic_urls = list(set(all_pic_urls))  # de-duplicate; ordering is not significant
    print("共获取%s张图片的链接" % len(all_pic_urls))
    for i, pic_url in enumerate(all_pic_urls):
        down_pic(pic_url, folder, i)
if __name__ == '__main__':
    # Search keyword -- change it to whatever you would type into Baidu
    # Images; the downloads land in pics/<keyword>/.
    keyword = u'高山'
    # Number of follow-up result pages to crawl after the first one.
    pages = 20
    main(keyword, pages)
以上,欢迎交流、点赞~