# Requires Google Chrome and a matching version of ChromeDriver.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import urllib.request
from urllib import error
from urllib import request
import os
import time
import sys
# ========================jiaxin===============================
# Module-level configuration for the crawler.
# Default search-engine URL — replace with yours if needed.
url = "https://www.google.com"
# url = "https://www.bing.com/"
# Browser to drive; must match one of the branches in CrawlSelenium.crawl().
explorer = "Chrome"
# Directory where downloaded images are saved.
imgs_dir = "./Flag"
# Search keywords: a space means AND, '|' means OR (Google query syntax).
downitem = "(印度|india|indian) Army Commander Flag"
# Filename prefix for saved images.
keyname = "Flag_"
# Starting index used when numbering saved image files.
startnum = 0
def downimg(url, path):
    """Download one image from *url* and write its bytes to *path*.

    Args:
        url: Direct URL of the image (any scheme urllib supports).
        path: Filesystem path the raw image bytes are written to.

    Returns:
        1 on success; callers use this to count completed downloads.

    Raises:
        urllib.error.HTTPError / URLError on network failures (handled by caller).
    """
    headers = {"User-Agent": "Chrome"}
    # Use a distinct local name: the original bound `request`, shadowing the
    # module-level `from urllib import request` import.
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the HTTP response even if read/write fails
    # (the original leaked the connection on every call).
    with urllib.request.urlopen(req) as response:
        img = response.read()
    with open(path, "wb") as f:
        f.write(img)
    print("狗日的,下载成功一张图片")
    return 1
class CrawlSelenium:
    """Selenium-driven crawler: searches Google Images for a query and
    downloads the full-resolution version of each result it can open.

    NOTE(review): the CSS selectors and Chinese UI strings below are tied to
    a specific Google Images page layout/locale — confirm they still match
    before relying on this.
    """

    def __init__(self, explorer="Chrome", url="https://www.google.com"):
        # Browser name selects which WebDriver is instantiated in crawl();
        # url is the search-engine home page to start from.
        self.url = url
        self.explorer = explorer

    def set_loading_strategy(self, strategy="normal"):
        """Build the browser Options used by crawl().

        strategy: Selenium page-load strategy ("normal", "eager" or "none").
        Certificate/SSL errors are ignored so image hosts with bad certs
        don't abort the crawl.
        """
        self.options = Options()
        self.options.add_argument('--ignore-certificate-errors')
        self.options.add_argument('--ignore-ssl-errors')
        self.options.add_argument("--enable-blink-features=PaintTiming")
        self.options.page_load_strategy = strategy
        # self.options.accept_insecure_certs = True

    def crawl(self, downitem):
        """Search for *downitem* on Google Images, scroll until all results
        are loaded, then download each image via downimg().

        downitem: the query string typed into the search box.
        Side effects: launches a browser, writes image files into the
        module-level imgs_dir, prints progress to stdout.
        """
        # Instantiate the driver matching the configured browser.
        # NOTE(review): webdriver.Opera was removed in Selenium 4 — that
        # branch likely fails on modern Selenium; confirm the installed version.
        if self.explorer == "Chrome":
            self.driver = webdriver.Chrome(options=self.options)
        if self.explorer == "Opera":
            self.driver = webdriver.Opera(options=self.options)
        if self.explorer == "Firefox":
            self.driver = webdriver.Firefox(options=self.options)
        if self.explorer == "Edge":
            self.driver = webdriver.Edge(options=self.options)
        # Navigate to the search engine home page.
        self.driver.get(self.url)
        time.sleep(5)
        # Locate the search input field (Google names it 'q').
        search_input = self.driver.find_element(By.NAME, 'q')
        # Emulate user input and press Enter to search.
        webdriver.ActionChains(self.driver).move_to_element(search_input).send_keys(downitem + Keys.ENTER).perform()
        time.sleep(5)
        # Switch to the image results tab ('图片' = "Images" in the zh-CN UI).
        self.driver.find_element(By.LINK_TEXT, '图片').click()
        time.sleep(5)
        # Scroll to the bottom to trigger lazy loading of more results.
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(5)
        # "Show more results" button (zh-CN label '显示更多搜索结果').
        show_more_button = self.driver.find_element(By.CSS_SELECTOR, "input[value='显示更多搜索结果']")
        time.sleep(5)
        try:
            # Keep scrolling/clicking until the page reports no more content.
            # The status messages compared below are the zh-CN Google strings:
            # "loading more", "new content loaded", "you've seen it all",
            # "failed to load, click to retry".
            while True:
                message = self.driver.find_element(By.CSS_SELECTOR, 'div.OuJzKb.Bqq24e').get_attribute('textContent')
                # print(message)
                if message == '正在加载更多内容,请稍候':
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                elif message == '新内容已成功加载。向下滚动即可查看更多内容。':
                    # New content arrived — scroll again and click "show more" if visible.
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                    if show_more_button.is_displayed():
                        show_more_button.click()
                elif message == '看来您已经看完了所有内容':
                    # All results loaded — stop scrolling.
                    break
                elif message == '无法加载更多内容,点击即可重试。':
                    show_more_button.click()
                else:
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        except Exception as err:
            print(err)
        # Collect all thumbnail elements on the results page.
        imgs = self.driver.find_elements(By.CSS_SELECTOR, "img.rg_i.Q4LuWd")
        print("获取到图片数量",len(imgs))
        time.sleep(10)
        # Scroll back to the top before clicking thumbnails.
        self.driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(5)
        img_count = startnum
        for index, img in enumerate(imgs):
            try:
                print('\ndownloading image ' + str(img_count) + ': ')
                # Fallback URL: the thumbnail src (often a base64 data: URI).
                img_url = img.get_attribute("src")
                src = 0
                try:
                    # Click the thumbnail to open the full-size preview, then
                    # poll up to 10 times for the real image URL to appear.
                    img.click()
                    print("打开图片成功,兄弟们稍安勿躁,等15秒让图片加载")
                    for i in range(10):
                        try:
                            jiaxin = self.driver.find_elements(By.CSS_SELECTOR, "img.r48jcc.pT0Scc.iPVvYb")
                            src = jiaxin[0].get_attribute("src")
                            # A real URL (not a data: URI) means the preview finished loading.
                            if "data:image" not in src and src != 0:
                                img_url = src
                                print("==========原图连接获取成功, 进行更改==========")
                                break
                            else:
                                # URL not updated yet — wait 4 s and retry.
                                print("网址没有更新,再等4秒", i)
                                time.sleep(4)
                        except:
                            # Preview pane not rendered yet — wait 5 s and retry.
                            print("页面没有刷新,再等待5秒", i)
                            time.sleep(5)
                    print("连接:",img_url)
                except:
                    # Thumbnail could not be clicked — fall back to the thumbnail src.
                    print("click点不开,下载原来的")
                    pass
                if img_url == None:
                    # No usable URL at all — skip this result.
                    print("他妈的,居然空的")
                    continue
                # Build the output filename: <keyname><index>_img.jpg in imgs_dir.
                path = os.path.join(imgs_dir, str(keyname)+str(img_count) + "_img.jpg")
                su = downimg(url=img_url, path=path)
                # Only advance the file index after a successful download.
                if su == 1:
                    img_count = img_count + 1
                    su = 0
            except error.HTTPError as http_err:
                print(http_err)
            except Exception as err:
                print(err)
def main():
    """Wire up the crawler from module-level config, make sure the output
    directory exists, then run the crawl for the configured query."""
    crawler = CrawlSelenium(explorer, url)
    crawler.set_loading_strategy("normal")
    # Create the image output directory on first run.
    if not os.path.exists(imgs_dir):
        os.mkdir(imgs_dir)
    crawler.crawl(downitem=downitem)


if __name__ == "__main__":
    main()