Batch-downloading high-resolution original images from Google Image Search

You need the Google Chrome browser and a ChromeDriver build that matches its version.
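
Before running the full script, it is worth checking that Selenium can actually drive the local Chrome installation. The snippet below is a minimal smoke test, assuming Selenium 4; the chromedriver path is a placeholder for wherever the driver sits on your machine (with Selenium 4.6+ you can usually omit the Service and let Selenium resolve the driver itself).

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--ignore-certificate-errors")
service = Service("/path/to/chromedriver")   # placeholder path; adjust for your machine
driver = webdriver.Chrome(service=service, options=options)
driver.get("https://www.google.com")
print("page title:", driver.title)           # a matching browser/driver pair should print the page title
driver.quit()

If the versions do not match, webdriver.Chrome() typically raises a SessionNotCreatedException whose message names the expected driver version.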

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import urllib.request
from urllib import error
import os
import time

# ======================== configuration ========================
# default search engine URL; replace with your own if needed
url = "https://www.google.com"
# url = "https://www.bing.com/"
explorer = "Chrome"
# output directory for downloaded images
imgs_dir = "./Flag"
# search query: a space means AND, "|" means OR
downitem = "(印度|india|indian) Army Commander Flag"
keyname = "Flag_"
# starting index used when naming saved images
startnum = 0


def downimg(url, path):
    """Download a single image from url and save it to path. Returns 1 on success."""
    headers = {"User-Agent": "Chrome"}
    req = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(req)
    img = response.read()
    with open(path, "wb") as f:
        f.write(img)
    print("Downloaded one image successfully")
    return 1

class CrawlSelenium:

    def __init__(self, explorer="Chrome", url="https://www.google.com"):
        self.url = url
        self.explorer = explorer

    def set_loading_strategy(self, strategy="normal"):
        self.options = Options()
        self.options.add_argument('--ignore-certificate-errors')
        self.options.add_argument('--ignore-ssl-errors')
        self.options.add_argument("--enable-blink-features=PaintTiming")
        self.options.page_load_strategy = strategy
        # self.options.accept_insecure_certs = True


    def crawl(self, downitem):
        # instantiate the driver for the chosen browser
        # (webdriver.Opera is only available in older Selenium releases)
        if self.explorer == "Chrome":
            self.driver = webdriver.Chrome(options=self.options)
        elif self.explorer == "Opera":
            self.driver = webdriver.Opera(options=self.options)
        elif self.explorer == "Firefox":
            self.driver = webdriver.Firefox(options=self.options)
        elif self.explorer == "Edge":
            self.driver = webdriver.Edge(options=self.options)

        # search on google
        # navigate to url
        self.driver.get(self.url)
        time.sleep(5)
        # locate input field
        search_input = self.driver.find_element(By.NAME, 'q')
        # emulate user input and enter to search
        webdriver.ActionChains(self.driver).move_to_element(search_input).send_keys(downitem + Keys.ENTER).perform()
        time.sleep(5)
        

        # switch to Google Images via the navigation tab
        # ('图片' is the "Images" tab label on the Chinese-language results page)
        self.driver.find_element(By.LINK_TEXT, '图片').click()
        time.sleep(5)

        # load as many images as possible by repeatedly scrolling to the bottom
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(5)
        # grab the "显示更多搜索结果" ("Show more results") button
        show_more_button = self.driver.find_element(By.CSS_SELECTOR, "input[value='显示更多搜索结果']")
        time.sleep(5)
        try:
            while True:
                # react to the status message at the bottom of the results page;
                # the literal strings below are what the Chinese-language Google UI displays
                message = self.driver.find_element(By.CSS_SELECTOR, 'div.OuJzKb.Bqq24e').get_attribute('textContent')
                # print(message)
                if message == '正在加载更多内容,请稍候':  # "Loading more content, please wait"
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                elif message == '新内容已成功加载。向下滚动即可查看更多内容。':  # "New content loaded. Scroll down for more."
                    # scroll to the bottom to trigger the next batch
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                    if show_more_button.is_displayed():
                        show_more_button.click()
                elif message == '看来您已经看完了所有内容':  # "Looks like you have reached the end"
                    break
                elif message == '无法加载更多内容,点击即可重试。':  # "Could not load more content, click to retry"
                    show_more_button.click()
                else:
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        except Exception as err:
            print(err)

        # find all thumbnail elements on the Google Images results page
        imgs = self.driver.find_elements(By.CSS_SELECTOR, "img.rg_i.Q4LuWd")
        print("Number of thumbnails found:", len(imgs))
        time.sleep(10)

        self.driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(5)
        
        img_count = startnum
        for img in imgs:
            try:
                print('\ndownloading image ' + str(img_count) + ': ')
                # fall back to the thumbnail src if the full-size URL cannot be obtained
                img_url = img.get_attribute("src")
                src = None
                try:
                    img.click()
                    print("Thumbnail clicked; waiting for the full-size image to load")

                    for i in range(10):
                        try:
                            preview = self.driver.find_elements(By.CSS_SELECTOR, "img.r48jcc.pT0Scc.iPVvYb")
                            src = preview[0].get_attribute("src")
                            if src and "data:image" not in src:
                                img_url = src
                                print("========== got the original-image URL, using it ==========")
                                break
                            else:
                                print("URL not updated yet, waiting another 4 seconds", i)
                                time.sleep(4)
                        except Exception:
                            print("Preview not rendered yet, waiting another 5 seconds", i)
                            time.sleep(5)

                    print("URL:", img_url)
                except Exception:
                    print("Could not open the preview; downloading the thumbnail instead")

                if img_url is None:
                    print("Image URL is empty, skipping")
                    continue
                path = os.path.join(imgs_dir, str(keyname) + str(img_count) + "_img.jpg")
                su = downimg(url=img_url, path=path)
                if su == 1:
                    img_count = img_count + 1
                    su = 0
            except error.HTTPError as http_err:
                print(http_err)
            except Exception as err:
                print(err)



def main():
    # set up the crawler
    crawl_s = CrawlSelenium(explorer, url)
    crawl_s.set_loading_strategy("normal")
    # create the output directory if it does not exist
    if not os.path.exists(imgs_dir):
        os.mkdir(imgs_dir)
    # start crawling
    crawl_s.crawl(downitem=downitem)


if __name__ == "__main__":
    main()
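
The downimg helper above has no timeout and gives up on the first network error, so a flaky connection can stall or skip downloads. A more defensive variant is sketched below; the function name download_with_retry, the retry count, and the timeout value are choices of this sketch rather than part of the original script, but it can stand in for downimg with minimal changes.

import time
import urllib.request

def download_with_retry(url, path, retries=3, timeout=10):
    # sketch: same job as downimg, but with a per-request timeout and a few retries
    headers = {"User-Agent": "Chrome"}
    req = urllib.request.Request(url, headers=headers)
    for attempt in range(1, retries + 1):
        try:
            with urllib.request.urlopen(req, timeout=timeout) as response:
                data = response.read()
            with open(path, "wb") as f:
                f.write(data)
            print("Downloaded one image successfully")
            return 1
        except OSError as err:  # URLError, HTTPError and socket timeouts are all OSError subclasses
            print("attempt", attempt, "failed:", err)
            time.sleep(2)
    return 0

In the crawl loop, the call su = downimg(url=img_url, path=path) would then simply become su = download_with_retry(img_url, path).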
