Python3爬取百度图片

# 根据搜索关键词爬取百度图片
import os
import re
import socket
import ssl
import urllib
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup
# NOTE(review): this replaces the stdlib's default HTTPS context factory with
# the unverified one, i.e. HTTPS certificate checks are turned off for code
# that relies on that default (urllib) — confirm this is really intended.
ssl._create_default_https_context = ssl._create_unverified_context
# Default timeout, in seconds, applied to sockets created after this call.
socket.setdefaulttimeout(10)

# Directory where downloaded images are stored.
# NOTE(review): on Windows "F:baidu_img/" is relative to the current directory
# of drive F:, not to the drive root ("F:/baidu_img/") — confirm which is meant.
save_path = "F:baidu_img/"
# Create the directory up front. exist_ok=True avoids the check-then-create
# race of exists() + mkdir(), and makedirs also creates missing parents.
os.makedirs(save_path, exist_ok=True)

def downmloadPicture(keyword,start_page,end_page):
    start_num = int(start_page) # 确定开始爬取的起始页,减1是因为百度图片第一页pn=0
    end_num = int(end_page)
    url='http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn=' # 此为分页版本的百度图片url pn为页码
    
    # 为 urllib.request.urlretrieve设置header
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10')]
    urllib.request.install_opener(opener)
    
    if start_num<1:
        print("Error:开始页码需要大于等于1")
    else:
        
        for i in range(start_num,end_num+1):
            x = (i-1)*20  # 百度图片的pn值每一页都比前一页多20
            urlx = url+str(x)
            result= requests.get(urlx)
            pic_url = re.findall('"objURL":"(.*?)",', result.text, re.S)
            print("正在下载第",i,"页:",urlx)
            
            for each in pic_url:
                img_name = each.split("/")[-1:][0]
                try:
                    urllib.request.urlretrieve(each,save_path+img_name)
                except requests.exceptions.ConnectionError:
                    print('【错误】当前图片无法下载,已跳过')
                    continue
                except socket.timeout:
                    count = 1
                    while count<=5:
                        try:
                            urllib.request.urlretrieve(each,save_path+img_name)
                            break
                        except socket.timeout:
                            print('Reloading for %d time'%count if count == 1 else 'Reloading for %d times'%count)
                            count+=1
                    if count>5:
                        print("当前图片下载失败!,已跳过")
                        continue
                except BaseException: # 其他错误
                    print('错误,当前图片无法下载,已跳过')
                    continue
                    
            print("第",i,"页爬取完毕")
        print("关键词:{0}, 第{1}页至第{2}页爬取完毕!".format(keyword,start_num,end_num))
                
# Script entry point: prompt for the search keyword and the page range, run
# the crawl, then print a completion message.
if __name__ == '__main__':
    # NOTE(review): `word` is bound at module scope here; keep this name in
    # case other code in this file reads it as a global.
    word = input("请输入关键词: ")
    # The page numbers are passed on as the raw strings returned by input().
    start_page = input("请输入爬取开始页码: ")
    end_page = input("请输入爬去结束页码: ")
    downmloadPicture(word,start_page,end_page)
    print("爬取完毕!")
    
        
    

 

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值