必应(Bing)图片搜索:下载高清原图

前提条件:需要安装谷歌浏览器(Chrome),以及与浏览器版本相对应的驱动(ChromeDriver)。

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

from urllib import error
from urllib import request
import os
import time
import sys

# by jiaxin
# Module-level settings read by main() and CrawlSelenium.crawl(); replace
# them with your own values.
# url = "https://www.google.com"
url = "https://global.bing.com/?mkt=en-us"
explorer = "Chrome"
# directory
imgs_dir = "./BMP-2_images" # folder the downloaded images are saved into
downitem = "(印度|india|indian) BMP-2" # search keywords: a space means AND, | means OR
keyname = "BMP-2_"
startnum  = 24 # starting index used when naming the downloaded images

def progress_callback(count_of_blocks, block_size, total_size):
    """Draw a one-line download progress indicator on standard output.

    The three-argument signature is the report-hook shape that
    ``urllib.request.urlretrieve`` calls while a transfer is running.

    Args:
        count_of_blocks: The number of blocks transferred so far.
        block_size: The size of one block.
        total_size: Total size of the file. ``urlretrieve`` reports ``-1``
            when the server does not announce a size, and ``0`` is possible
            for an empty body; both are treated as "size unknown".
    """
    transferred = count_of_blocks * block_size

    if total_size <= 0:
        # Without a usable total there is no percentage to compute (and a
        # zero total would divide by zero), so show the raw amount instead.
        sys.stdout.write("\r%d bytes downloaded" % transferred)
        sys.stdout.flush()
        return

    # The bar is 50 cells wide; the last block usually overshoots the real
    # size, so clamp to the bar width (and never go below zero).
    progress = max(0, min(50, int(50 * transferred / total_size)))

    # "\r" rewinds to the start of the line so the bar redraws in place.
    sys.stdout.write("\r[%s%s] %d%%" % ('█' * progress, '  ' * (50 - progress), progress * 2))
    sys.stdout.flush()


class CrawlSelenium:
    """Drive a browser through an image search and download every result.

    The crawler opens ``url``, types the search keywords into the field named
    ``q``, switches to the image results, scrolls to load more thumbnails and
    then downloads each image, preferring the full-size source shown in the
    preview overlay over the thumbnail.

    Downloads rely on module-level settings defined alongside this class:
    ``imgs_dir`` (target folder), ``keyname`` (file-name prefix), ``startnum``
    (first file index) and ``progress_callback`` (download report hook).
    """

    # Browser names accepted as ``explorer``. Each one is looked up by name on
    # ``selenium.webdriver`` when the driver is created.
    # NOTE(review): recent Selenium releases no longer ship ``webdriver.Opera``;
    # the name is kept for installations that still provide it.
    SUPPORTED_EXPLORERS = ("Chrome", "Opera", "Firefox", "Edge")

    _SCROLL_TO_BOTTOM = "window.scrollTo(0, document.body.scrollHeight)"
    _CLOSE_BUTTON_XPATH = "//div[@class='close nofocus']"

    def __init__(self, explorer="Chrome", url="https://www.google.com"):
        """Remember which browser to drive and which page to start from.

        Args:
            explorer: Browser name, one of ``SUPPORTED_EXPLORERS``.
            url: Address of the search page opened by :meth:`crawl`.
        """
        self.url = url
        self.explorer = explorer
        # Filled in by set_loading_strategy() / crawl().
        self.options = None
        self.driver = None

    def set_loading_strategy(self, strategy="normal"):
        """Build the browser options and set the page-load strategy.

        Args:
            strategy: Value assigned to ``options.page_load_strategy``.
        """
        # Use the options class that matches the selected browser (e.g.
        # ``webdriver.FirefoxOptions``); fall back to the Chrome options the
        # file imports when the browser has no class of its own.
        options_cls = getattr(webdriver, str(self.explorer) + "Options", Options)
        self.options = options_cls()
        # NOTE(review): these look like Chromium command-line switches;
        # confirm they are harmless for non-Chromium browsers.
        self.options.add_argument("--edge-webview-switches=--webview-distribution=canary")
        self.options.add_argument('--ignore-certificate-errors')
        self.options.add_argument('--ignore-ssl-errors')
        self.options.page_load_strategy = strategy

    def crawl(self, downitem):
        """Search for ``downitem`` and download every image result found.

        The browser is always shut down when the crawl ends, whether it
        finished normally or failed part-way.

        Args:
            downitem: Search keywords typed into the search field.

        Raises:
            ValueError: If ``self.explorer`` is not a supported browser name.
        """
        self.driver = self._create_driver()
        try:
            self._open_image_results(downitem)
            self._load_more_results()

            time.sleep(15)
            imgs = self.driver.find_elements(By.CSS_SELECTOR, "img.mimg")
            print("获取到图片数量", len(imgs))
            img_count = startnum
            time.sleep(15)
            for img in imgs:
                # The file index only advances after a successful download.
                if self._download_image(img, img_count):
                    img_count = img_count + 1
        finally:
            # Release the browser and driver process even on failure.
            self.driver.quit()

    def _create_driver(self):
        """Instantiate the webdriver that corresponds to ``self.explorer``."""
        # Validate first so a typo fails with a clear message instead of an
        # AttributeError on a driver that was never created.
        if self.explorer not in self.SUPPORTED_EXPLORERS:
            raise ValueError(
                "unsupported explorer %r; expected one of %s"
                % (self.explorer, ", ".join(self.SUPPORTED_EXPLORERS))
            )
        # crawl() may be called without set_loading_strategy(); use defaults.
        if getattr(self, "options", None) is None:
            self.set_loading_strategy()
        return getattr(webdriver, self.explorer)(options=self.options)

    def _open_image_results(self, downitem):
        """Load the search page, submit the query and open the image results."""
        self.driver.get(self.url)
        time.sleep(15)
        # locate input field
        search_input = self.driver.find_element(By.NAME, 'q')
        time.sleep(3)
        # emulate user input and enter to search
        webdriver.ActionChains(self.driver).move_to_element(search_input).send_keys(downitem + Keys.ENTER).perform()
        time.sleep(3)
        # The images tab is located by its visible (Chinese) link text.
        # NOTE(review): this depends on the page being served in Chinese.
        self.driver.find_element(By.LINK_TEXT, '图片').click()
        time.sleep(5)

    def _scroll_down(self, times):
        """Scroll to the page bottom ``times`` times, pausing 5 s after each."""
        try:
            for _ in range(times):
                self.driver.execute_script(self._SCROLL_TO_BOTTOM)
                time.sleep(5)
        except Exception as err:
            # Best effort: a failed scroll only means fewer images are loaded.
            print(err)

    def _load_more_results(self):
        """Load as many thumbnails as possible before collecting them."""
        self.driver.execute_script(self._SCROLL_TO_BOTTOM)
        time.sleep(3)
        self._scroll_down(10)

        try:
            self.driver.find_element(By.LINK_TEXT, '查看更多图片').click()
        except Exception:
            # The "see more images" link is optional; carry on without it.
            print("找不到更多图片按钮,不是什么错误,只是想多翻点图片")
        time.sleep(5)
        self._scroll_down(8)

    def _resolve_image_url(self, img):
        """Return the best download URL for the thumbnail element ``img``.

        Starts from the thumbnail's own ``src`` and, if the preview overlay
        can be opened, replaces it with the ``src`` of an overlay image that
        does not contain ``bing`` in its address.
        """
        img_url = img.get_attribute("src")
        try:
            img.click()
            print("打开图片成功,兄弟们稍安勿躁,等15秒让图片加载")
            time.sleep(8)
            self.driver.switch_to.frame("OverlayIFrame")
            time.sleep(5)
            for candidate in self.driver.find_elements(By.CSS_SELECTOR, "img.nofocus"):
                try:
                    src = candidate.get_attribute("src")
                except Exception:
                    print("获取清晰src的时候出错了")
                    continue
                # Skip images without a source and those whose address
                # contains "bing"; the last remaining candidate wins.
                if not src or 'bing' in src:
                    continue
                img_url = src
                print("老天保佑,原图连接获取成功")
            print("连接:", img_url)
        except Exception:
            # Could not open the overlay: fall back to the thumbnail URL.
            print("点不开,下载原来的")
        return img_url

    def _close_overlay(self):
        """Close the preview overlay if present and return to the parent frame."""
        try:
            self.driver.find_element(By.XPATH, self._CLOSE_BUTTON_XPATH).click()
        except Exception:
            # No close button to click; nothing to dismiss.
            pass
        self.driver.switch_to.parent_frame()

    def _download_image(self, img, img_count):
        """Download one image; return True only if a file was saved.

        Any error (including ``urllib.error.HTTPError``) is printed and the
        image is skipped. The overlay clean-up runs in every case, including
        when no URL could be determined, so the next thumbnail is always
        handled from the results page.
        """
        try:
            print('\ndownloading image ' + str(img_count) + ': ')
            time.sleep(5)
            img_url = self._resolve_image_url(img)
            if img_url is None:
                print("他妈的,居然空的")
                return False
            path = os.path.join(imgs_dir, str(keyname) + str(img_count) + "_img.jpg")
            request.urlretrieve(url=img_url, filename=path, reporthook=progress_callback, data=None)
            time.sleep(5)
            print("\n狗日的,终于下载成功一张")
            return True
        except Exception as err:
            print(err)
            return False
        finally:
            self._close_overlay()



def main():
    """Configure the crawler, ensure the output folder exists, and crawl.

    Uses the module-level settings ``explorer``, ``url``, ``imgs_dir`` and
    ``downitem``.
    """
    # setting
    crawl_s = CrawlSelenium(explorer, url)
    crawl_s.set_loading_strategy("normal")
    # make directory; exist_ok avoids the check-then-create race and
    # makedirs also copes with a nested imgs_dir.
    os.makedirs(imgs_dir, exist_ok=True)
    # crawling
    crawl_s.crawl(downitem=downitem)


# Start the crawl only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值