Python Crawler: Batch-Download Baidu Images by Keyword

The full script is below; it should run on both Python 2 and Python 3.

# -*- coding: utf-8 -*-
# @meta: download photos from Baidu Images by keyword, to label photos with classes, like tree, building, mountain, grass, lake
import os
import re
import sys

import requests

# quote() lives in different modules on Python 2 and Python 3
if sys.version_info[0] == 2:
    from urllib import quote
else:
    from urllib.parse import quote

path = sys.path[0] + os.sep          # directory the script is located in
pic_folder = path + 'pics' + os.sep  # all downloads go under pics/<keyword>/

def down_pic(pic_url, folder, i):
    """Download one image and save it as <i+1>.jpg in the given folder."""
    try:
        fn = str(i + 1) + '.jpg'
        if os.path.exists(folder + fn):  # skip images that were already downloaded
            return
        pic = requests.get(pic_url, timeout=15)
        with open(folder + fn, 'wb') as f:
            f.write(pic.content)
            if (i + 1) % 10 == 0:        # report progress every 10 images
                print('Downloaded image %s: %s' % (str(i + 1), str(pic_url)))
    except Exception:
        pass                             # ignore URLs that time out or cannot be fetched

def get_onepage_urls(onepageurl):
    """Return all image URLs on one result page plus the URL of the next page."""
    if not onepageurl:
        print('Reached the last page, stopping.')
        return [], ''
    try:
        html = requests.get(onepageurl, timeout=15)
        html.encoding = 'utf-8'
        html = html.text
    except Exception as e:
        print(e)
        return [], ''
    # each original image URL is embedded in the page source as "objURL":"..."
    pic_urls = re.findall('"objURL":"(.*?)",', html, re.S)
    # the "next page" link is the <a> tag with class "n" whose text is 下一页
    next_page_urls = re.findall(r'<a href="(.*?)" class="n">下一页</a>', html)
    next_page_url = 'http://image.baidu.com' + next_page_urls[0] if next_page_urls else ''
    return pic_urls, next_page_url

def main(keyword, pages):
    folder = pic_folder + keyword + os.sep  # save each keyword's photos in its own folder
    if not os.path.exists(folder):
        os.makedirs(folder)                 # create pics/<keyword>/, including the pics/ parent if needed
    url_init_first = r'http://image.baidu.com/search/flip?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1497491098685_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&ctd=1497491098685%5E00_1519X735&word='
    # encode to UTF-8 bytes so quote() handles Chinese keywords on both Python 2 and 3
    url_init = url_init_first + quote(keyword.encode('utf-8'), safe='/')
    all_pic_urls = []
    onepage_urls, next_page_url = get_onepage_urls(url_init)
    all_pic_urls.extend(onepage_urls)
    page = 0  # number of result pages crawled so far
    while page < pages:
        onepage_urls, next_page_url = get_onepage_urls(next_page_url)
        page += 1
        print('Page %s' % str(page))
        if next_page_url == '' and onepage_urls == []:
            break
        all_pic_urls.extend(onepage_urls)
    all_pic_urls = list(set(all_pic_urls))  # de-duplicate before downloading
    print('Collected %s image links in total' % len(all_pic_urls))
    for i in range(len(all_pic_urls)):
        down_pic(all_pic_urls[i], folder, i)

if __name__ == '__main__':
    keyword = u'高山'  # keyword: change it to whatever you would search for on Baidu Images
    pages = 20         # number of result pages to crawl
    main(keyword, pages)
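
For labelling several classes at once (as the header comment suggests), a minimal usage sketch along these lines should work, assuming the script above is saved as baidu_pic_spider.py (a hypothetical filename):

# -*- coding: utf-8 -*-
# Minimal usage sketch: download several labelled classes in one run.
# Assumes the crawler above is saved as baidu_pic_spider.py (hypothetical name).
from baidu_pic_spider import main

keywords = [u'树', u'建筑', u'高山', u'草地', u'湖泊']  # tree, building, mountain, grass, lake
for kw in keywords:
    main(kw, 20)  # crawl 20 result pages per keyword; images are saved under pics/<keyword>/

Because the download loop only runs under the if __name__ == '__main__': guard, importing the module this way does not trigger an extra crawl.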

That's all. Comments and likes are welcome!
