必应图片爬虫

使用说明:只需修改主程序中列表 list1 的内容(即要搜索的图片关键词),其余代码无需改动。

import os
import re
import sys
import time
import urllib
import urllib.parse
import urllib.request

import requests
from bs4 import BeautifulSoup



# HTTP headers sent with the result-page request; a desktop browser
# User-Agent is supplied instead of urllib's default one.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/50.0.2661.102 UBrowser/6.1.2107.204 Safari/537.36'
    ),
}

# Template for Bing's asynchronous image-result endpoint. Placeholders:
#   {0} URL-quoted search keyword, {1} index of the first result,
#   {2} number of results per page, {3} page sequence number (SFX).
url = (
    "https://cn.bing.com/images/async"
    "?q={0}&first={1}&count={2}"
    "&scenario=ImageBasicHover&datsrc=N_I&layout=ColumnBased&mmasync=1"
    "&dgState=c*9_y*2226s2180s2072s2043s2292s2295s2079s2203s2094_i*71_w*198"
    "&IG=0D6AD6CBAF43430EA716510A4754C951"
    "&SFX={3}&iid=images.5599"
)

def getImage(url, count, plane_name):
    '''Download one full-size image and save it under ./imgs/<plane_name>/.

    The file is written as ``<plane_name><count + 1>.jpg``. Downloading is
    best effort: any failure is reported on stdout and the image is skipped
    instead of aborting the whole crawl.

    Args:
        url: Address of the full-size image.
        count: Zero-based index of this image; ``count + 1`` is used in the
            file name and in the progress message.
        plane_name: Search keyword, used as both the sub-directory name and
            the file-name prefix.

    Returns:
        True if the image was saved, False if the download failed.
    '''
    target_dir = os.path.join('./imgs', plane_name)
    target_file = os.path.join(target_dir, plane_name + str(count + 1) + '.jpg')
    try:
        # Short pause between downloads to avoid hammering the image hosts.
        time.sleep(0.5)
        # Make sure the destination exists even when the caller did not
        # create it beforehand.
        os.makedirs(target_dir, exist_ok=True)
        urllib.request.urlretrieve(url, target_file)
    except Exception as e:
        # Deliberately broad: one bad image must not stop the crawl, but the
        # reason is printed so failures are not silent.
        time.sleep(1)
        print("本张图片获取异常,跳过...", e)
        return False
    else:
        print("图片+1,成功保存 " + str(count + 1) + " 张图")
        return True
def findImgUrlFromHtml(html, rule, url, key, first, loadNum, sfx, count, plane_name):
    '''Find the full-size image URLs on one thumbnail page and download them.

    Every ``<a class="iusc">`` element of the page is serialized and searched
    with ``rule``; each hit is handed to ``getImage``.

    Args:
        html: Page markup (string or file-like object) for BeautifulSoup.
        rule: Regex whose match is expected to start with the 8-character
            prefix ``"murl":"`` followed by the image URL (this is what the
            pattern built in the main script produces).
        url, key, first, loadNum, sfx: Not used here; kept so existing
            callers keep working.
        count: Number of images handled so far.
        plane_name: Search keyword, forwarded to ``getImage`` for naming.

    Returns:
        ``count`` increased by the number of image URLs found on this page.
    '''
    soup = BeautifulSoup(html, "lxml")
    for link in soup.find_all("a", class_="iusc"):
        result = re.search(rule, str(link))
        if result is None:
            # Anchor without an image URL: skip it instead of crashing on
            # result.group().
            continue
        # Drop the leading '"murl":"' (8 characters) to keep only the URL.
        img_url = result.group(0)[8:]
        # str(link) re-serializes the tag, which HTML-escapes '&' in
        # attribute values; undo that so query strings stay valid.
        img_url = img_url.replace("&amp;", "&")
        getImage(img_url, count, plane_name)
        count += 1
    # Page finished; the caller decides whether to load the next one.
    return count
def getStartHtml(url, key, first, loadNum, sfx):
    '''Request one page of thumbnail results and return the open response.

    ``url`` is a format template filled, in order, with ``key``, ``first``,
    ``loadNum`` and ``sfx``; the request is sent with the module-level
    ``header`` dictionary.
    '''
    page_url = url.format(key, first, loadNum, sfx)
    request = urllib.request.Request(page_url, headers=header)
    return urllib.request.urlopen(request)
if __name__ == '__main__':
    # Search keywords; one sub-directory of ./imgs/ is created per keyword.
    list1 = ["风影无人机", "BZK-005无人机", "彩虹4无人机"]

    countNum = 20  # minimum number of images to fetch per keyword
    loadNum = 35   # number of results requested per page
    # Matches '"murl":"http...' up to the closing quote; compiled once
    # because it is the same for every keyword.
    rule = re.compile(r"\"murl\"\:\"http\S[^\"]+")

    for plane_name in list1:
        path = './imgs/' + plane_name + '/'   # where the images are saved
        key = urllib.parse.quote(plane_name)
        first = 1
        sfx = 1
        count = 0
        os.makedirs(path, exist_ok=True)
        while count < countNum:
            html = getStartHtml(url, key, first, loadNum, sfx)
            try:
                new_count = findImgUrlFromHtml(html, rule, url, key, first,
                                               loadNum, sfx, count, plane_name)
            finally:
                # Release the HTTP connection once the page has been parsed.
                html.close()
            if new_count == count:
                # The page yielded no images; requesting it again would loop
                # forever, so give up on this keyword.
                print("未找到更多图片,停止搜索: " + plane_name)
                break
            count = new_count
            first = count + 1
            sfx += 1

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值