# Prerequisite: Google Chrome and a matching version of ChromeDriver must be installed.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from urllib import error
from urllib import request
import os
import time
import sys
# by jiaxin
# replace for yours
# url = "https://www.google.com"
url = "https://global.bing.com/?mkt=en-us"  # search-engine start page to open
explorer = "Chrome"  # browser to drive; must match the installed webdriver
# directory
imgs_dir = "./BMP-2_images"  # folder the downloaded images are saved into
downitem = "(印度|india|indian) BMP-2"  # search keywords: space means AND, | means OR
keyname = "BMP-2_"  # filename prefix for every saved image
startnum = 24  # starting index used when numbering downloaded images
def progress_callback(count_of_blocks, block_size, total_size):
    """Report hook for urllib.request.urlretrieve: draw a 50-char progress bar.

    Args:
        count_of_blocks: number of blocks transferred so far.
        block_size: size of each transfer block in bytes.
        total_size: total file size in bytes; urlretrieve passes -1 (or 0)
            when the server does not report Content-Length.
    """
    if total_size <= 0:
        # Size unknown -- a percentage cannot be computed (and dividing by
        # total_size would raise ZeroDivisionError); show bytes received.
        sys.stdout.write("\r%d bytes" % (count_of_blocks * block_size))
        sys.stdout.flush()
        return
    # Clamp to 50 cells: the final callback can overshoot total_size because
    # the last block is usually only partially filled.
    progress = min(50, int(50 * (count_of_blocks * block_size) / total_size))
    # Each cell is 2% (50 cells == 100%), hence progress * 2 for the label.
    sys.stdout.write("\r[%s%s] %d%%" % ('█' * progress, ' ' * (50 - progress), progress * 2))
    sys.stdout.flush()
class CrawlSelenium:
    """Selenium-driven image crawler.

    Opens a search engine, searches for a keyword, switches to the image
    results, scrolls repeatedly to lazy-load more thumbnails, and downloads
    each image -- preferring the full-resolution original when the preview
    overlay exposes it.

    NOTE(review): the selectors used in crawl() ('img.mimg', frame name
    'OverlayIFrame', 'img.nofocus', link text '图片') look Bing-specific
    even though the default url is Google -- verify against the target site.
    """
    def __init__(self, explorer="Chrome", url="https://www.google.com"):
        # explorer: browser name, one of "Chrome"/"Opera"/"Firefox"/"Edge".
        # url: search-engine start page to open first.
        self.url = url
        self.explorer = explorer
    def set_loading_strategy(self, strategy="normal"):
        """Build self.options for the browser; must be called before crawl().

        strategy: Selenium page-load strategy ("normal", "eager" or "none").
        """
        self.options = Options()
        self.options.add_argument("--edge-webview-switches=--webview-distribution=canary")
        self.options.add_argument ('--ignore-certificate-errors')
        self.options.add_argument ('--ignore-ssl-errors')
        self.options.page_load_strategy = strategy
    def crawl(self,downitem):
        """Search for `downitem`, open the image results, download each image.

        Files are written to the module-level `imgs_dir`, named
        `keyname` + running index (starting at `startnum`) + "_img.jpg".
        Requires set_loading_strategy() to have been called first.
        """
        # instantiate driver according to corresponding explorer
        if self.explorer == "Chrome":
            self.driver = webdriver.Chrome(options=self.options)
        if self.explorer == "Opera":
            self.driver = webdriver.Opera(options=self.options)
        if self.explorer == "Firefox":
            self.driver = webdriver.Firefox(options=self.options)
        if self.explorer == "Edge":
            self.driver = webdriver.Edge(options=self.options)
        # navigate to the search-engine start page
        self.driver.get(self.url)
        time.sleep(15)
        # locate the search input field (name="q" on both Google and Bing)
        search_input = self.driver.find_element(By.NAME, 'q')
        time.sleep(3)
        # emulate user input and press Enter to search
        webdriver.ActionChains(self.driver).move_to_element(search_input).send_keys(downitem + Keys.ENTER).perform()
        time.sleep(3)
        # switch to the image-results tab; link text '图片' means "Images",
        # so this assumes a Chinese-locale UI -- confirm for other locales
        # self.driver.find_element(By.LINK_TEXT, '国际版').click()
        self.driver.find_element(By.LINK_TEXT, '图片').click()
        time.sleep(5)
        # scroll to the bottom once to trigger lazy loading of results
        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        #
        time.sleep(3)
        try:
            # keep scrolling to load as many thumbnails as possible
            for i in range(10):
                # do according to message
                # message = self.driver.find_element(By.CSS_SELECTOR, 'div.OuJzKb.Bqq24e').get_attribute('textContent')
                # print(message)
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(5)
        except Exception as err:
            print(err)
        # click the "see more images" button if present; its absence is
        # expected (not an error) -- we just want extra results
        # show_more_button = self.driver.find_element(By.CSS_SELECTOR, "input[value='查看更多图片']")
        try:
            self.driver.find_element(By.LINK_TEXT, '查看更多图片').click()
        except:
            print("找不到更多图片按钮,不是什么错误,只是想多翻点图片")
            pass
        time.sleep(5)
        try:
            # scroll some more after expanding the result list
            for i in range(8):
                # do according to message
                # message = self.driver.find_element(By.CSS_SELECTOR, 'div.OuJzKb.Bqq24e').get_attribute('textContent')
                # print(message)
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
                time.sleep(5)
        except Exception as err:
            print(err)
        time.sleep(15)
        # collect all thumbnail elements (Bing result thumbnails use class "mimg")
        imgs = self.driver.find_elements(By.CSS_SELECTOR, "img.mimg")
        print("获取到图片数量",len(imgs))
        img_count = startnum
        time.sleep(15)
        for img in imgs:
            try:
                # one image per iteration, throttled with sleeps
                print('\ndownloading image ' + str(img_count) + ': ')
                time.sleep(5)
                # default to the thumbnail src; replaced below if the
                # full-resolution original can be resolved
                img_url = img.get_attribute("src")
                try:
                    # open the preview overlay to reach the original image
                    img.click()
                    print("打开图片成功,兄弟们稍安勿躁,等15秒让图片加载")
                    time.sleep(8)
                    self.driver.switch_to.frame("OverlayIFrame")
                    time.sleep(5)
                    jiaxin = self.driver.find_elements(By.CSS_SELECTOR, "img.nofocus")
                    # url = jiaxin[0].get_attribute("src")
                    for i in jiaxin:
                        try:
                            t= i.get_attribute("src")
                            # srcs hosted on a bing domain are cached previews;
                            # keep looking for the original external source
                            if 'bing' in t:
                                continue
                            else:
                                img_url = t
                                print("老天保佑,原图连接获取成功")
                        except:
                            print("获取清晰src的时候出错了")
                            continue
                    print("连接:",img_url)
                    # time.sleep(5)
                except:
                    # overlay failed to open -- fall back to the thumbnail
                    print("点不开,下载原来的")
                    pass
                if img_url == None:
                    print("他妈的,居然空的")
                    continue
                # download via urllib, reporting progress on stdout
                path = os.path.join(imgs_dir, str(keyname)+str(img_count) + "_img.jpg")
                request.urlretrieve(url = img_url, filename = path, reporthook = progress_callback, data = None)
                img_count = img_count + 1
                time.sleep(5)
                print("\n狗日的,终于下载成功一张")
                # close the overlay and return to the parent frame so the
                # next thumbnail click happens in the results page
                try:
                    self.driver.find_element(By.XPATH, "//div[@class='close nofocus']").click()
                except:
                    pass
                self.driver.switch_to.parent_frame()
            except error.HTTPError as http_err:
                # download failed -- clean up the overlay and move on
                print(http_err)
                try:
                    self.driver.find_element(By.XPATH, "//div[@class='close nofocus']").click()
                except:
                    pass
                self.driver.switch_to.parent_frame()
            except Exception as err:
                # any other per-image failure: clean up and continue the loop
                print(err)
                try:
                    self.driver.find_element(By.XPATH, "//div[@class='close nofocus']").click()
                except:
                    pass
                self.driver.switch_to.parent_frame()
def main():
    """Entry point: configure the crawler, ensure the output directory
    exists, then start crawling for the configured keyword."""
    crawl_s = CrawlSelenium(explorer, url)
    crawl_s.set_loading_strategy("normal")
    # makedirs(exist_ok=True) is race-free and also creates missing parent
    # directories; the old exists()+mkdir pair could fail between the check
    # and the create, or if imgs_dir's parent did not exist.
    os.makedirs(imgs_dir, exist_ok=True)
    # crawling
    crawl_s.crawl(downitem = downitem)
# Standard script guard: run the crawler only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()