# Requires Google Chrome and a matching version of ChromeDriver.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import urllib.request
from urllib import error
from urllib import request
import os
import time
import sys
# ========================jiaxin===============================
# Module-level configuration for the crawler.
# Default search-engine URL — replace with yours if needed.
url = "https://www.google.com"
# url = "https://www.bing.com/"
# Browser to drive; must match one of the branches in CrawlSelenium.crawl().
explorer = "Chrome"
# Directory where downloaded images are saved.
imgs_dir = "./Flag"
# Search keywords: a space means AND, '|' means OR (Google query syntax).
downitem = "(印度|india|indian) Army Commander Flag"
# Filename prefix for saved images.
keyname = "Flag_"
# Starting index used when numbering saved image files.
startnum = 0
def downimg(url, path):
    """Download one image from *url* and write its bytes to *path*.

    Args:
        url: Direct URL of the image (any scheme urllib supports).
        path: Filesystem path the raw image bytes are written to.

    Returns:
        1 on success; callers use this to count completed downloads.

    Raises:
        urllib.error.HTTPError / URLError on network failures (handled by caller).
    """
    headers = {"User-Agent": "Chrome"}
    # Use a distinct local name: the original bound `request`, shadowing the
    # module-level `from urllib import request` import.
    req = urllib.request.Request(url, headers=headers)
    # Context manager closes the HTTP response even if read/write fails
    # (the original leaked the connection on every call).
    with urllib.request.urlopen(req) as response:
        img = response.read()
    with open(path, "wb") as f:
        f.write(img)
    print("狗日的,下载成功一张图片")
    return 1
class CrawlSelenium:
    """Selenium-driven crawler: searches Google Images for a query and
    downloads the full-resolution version of each result it can open.

    NOTE(review): the CSS selectors and Chinese UI strings below are tied to
    a specific Google Images page layout/locale — confirm they still match
    before relying on this.
    """

    def __init__(self, explorer="Chrome", url="https://www.google.com"):
        # Browser name selects which WebDriver is instantiated in crawl();
        # url is the search-engine home page to start from.
        self.url = url
        self.explorer = explorer

    def set_loading_strategy(self, strategy="normal"):
        """Build the browser Options used by crawl().

        strategy: Selenium page-load strategy ("normal", "eager" or "none").
        Certificate/SSL errors are ignored so image hosts with bad certs
        don't abort the crawl.
        """
        self.options = Options()
        self.options.add_argument('--ignore-certificate-errors')
        self.options.add_argument('--ignore-ssl-errors')
        self.options.add_argument("--enable-blink-features=PaintTiming")
        self.options.page_load_strategy = strategy
        # self.options.accept_insecure_certs = True

    def crawl(self, downitem):
        """Search for *downitem* on Google Images, scroll until all results
        are loaded, then download each image via downimg().

        downitem: the query string typed into the search box.
        Side effects: launches a browser, writes image files into the
        module-level imgs_dir, prints progress to stdout.
        """
        # Instantiate the driver matching the configured browser.
        # NOTE(review): webdriver.Opera was removed in Selenium 4 — that
        # branch likely fails on modern Selenium; confirm the installed version.
        if self.explorer == "Chrome":
            self.driver = webdriver.Chrome(options=self.options)
        if self.explorer == "Opera":
            self.driver = webdriver.Opera(options=self.options)
        if self.explorer == "Firefox":
            self.driver = webdriver.Firefox(options=self.options)
        if self.explorer == "Edge":
            self.driver = webdriver.Edge(options=self.options)
        # Navigate to the search engine home page.
        self.driver.get(self.url)
        time.sleep(5)
        # Locate the search input field (Google names it 'q').
        search_input = self.driver.find_element(By.NAME, 'q')
        # Emulate user input and press Enter to search.
        webdriver.ActionChains(self.driver).move_to_element(search_input).send_keys(downitem + Keys.ENTER).perform()
        time.sleep(5)
        # Switch to the image results tab ('图片' = "Images" in the zh-CN UI).
        self.driver.find_element(By.LINK_TEXT, '图片').click()
        time.sleep(5)
        # Scroll to the bottom to trigger lazy loading of more results.
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(5)
        # "Show more results" button (zh-CN label '显示更多搜索结果').
        show_more_button = self.driver.find_element(By.CSS_SELECTOR, "input[value='显示更多搜索结果']")
        time.sleep(5)
        try:
            # Keep scrolling/clicking until the page reports no more content.
            # The status messages compared below are the zh-CN Google strings:
            # "loading more", "new content loaded", "you've seen it all",
            # "failed to load, click to retry".
            while True:
                message = self.driver.find_element(By.CSS_SELECTOR, 'div.OuJzKb.Bqq24e').get_attribute('textContent')
                # print(message)
                if message == '正在加载更多内容,请稍候':
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                elif message == '新内容已成功加载。向下滚动即可查看更多内容。':
                    # New content arrived — scroll again and click "show more" if visible.
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                    if show_more_button.is_displayed():
                        show_more_button.click()
                elif message == '看来您已经看完了所有内容':
                    # All results loaded — stop scrolling.
                    break
                elif message == '无法加载更多内容,点击即可重试。':
                    show_more_button.click()
                else:
                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        except Exception as err:
            print(err)
        # Collect all thumbnail elements on the results page.
        imgs = self.driver.find_elements(By.CSS_SELECTOR, "img.rg_i.Q4LuWd")
        print("获取到图片数量",len(imgs))
        time.sleep(10)
        # Scroll back to the top before clicking thumbnails.
        self.driver.execute_script("window.scrollTo(0, 0);")
        time.sleep(5)
        img_count = startnum
        for index, img in enumerate(imgs):
            try:
                print('\ndownloading image ' + str(img_count) + ': ')
                # Fallback URL: the thumbnail src (often a base64 data: URI).
                img_url = img.get_attribute("src")
                src = 0
                try:
                    # Click the thumbnail to open the full-size preview, then
                    # poll up to 10 times for the real image URL to appear.
                    img.click()
                    print("打开图片成功,兄弟们稍安勿躁,等15秒让图片加载")
                    for i in range(10):
                        try:
                            jiaxin = self.driver.find_elements(By.CSS_SELECTOR, "img.r48jcc.pT0Scc.iPVvYb")
                            src = jiaxin[0].get_attribute("src")
                            # A real URL (not a data: URI) means the preview finished loading.
                            if "data:image" not in src and src != 0:
                                img_url = src
                                print("==========原图连接获取成功, 进行更改==========")
                                break
                            else:
                                # URL not updated yet — wait 4 s and retry.
                                print("网址没有更新,再等4秒", i)
                                time.sleep(4)
                        except:
                            # Preview pane not rendered yet — wait 5 s and retry.
                            print("页面没有刷新,再等待5秒", i)
                            time.sleep(5)
                    print("连接:",img_url)
                except:
                    # Thumbnail could not be clicked — fall back to the thumbnail src.
                    print("click点不开,下载原来的")
                    pass
                if img_url == None:
                    # No usable URL at all — skip this result.
                    print("他妈的,居然空的")
                    continue
                # Build the output filename: <keyname><index>_img.jpg in imgs_dir.
                path = os.path.join(imgs_dir, str(keyname)+str(img_count) + "_img.jpg")
                su = downimg(url=img_url, path=path)
                # Only advance the file index after a successful download.
                if su == 1:
                    img_count = img_count + 1
                    su = 0
            except error.HTTPError as http_err:
                print(http_err)
            except Exception as err:
                print(err)
def main():
    """Wire up the crawler from module-level config, make sure the output
    directory exists, then run the crawl for the configured query."""
    crawler = CrawlSelenium(explorer, url)
    crawler.set_loading_strategy("normal")
    # Create the image output directory on first run.
    if not os.path.exists(imgs_dir):
        os.mkdir(imgs_dir)
    crawler.crawl(downitem=downitem)


if __name__ == "__main__":
    main()