from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import os
import re
import time
class Spider:
    """Scrape image URLs from mm.enterdesk.com with Selenium and download them."""

    # Landing page to scrape.
    url = "https://mm.enterdesk.com/"
    # Local directory where downloaded images are saved.
    directory = "images2"
    # Raw string: the original '\/([0-9]+?).html' had an invalid '\/' escape
    # and an unescaped '.' that matched any character, not just a literal dot.
    pages_pattern = r'/([0-9]+?)\.html'

    def get_html(self):
        """Open the site in Chrome, scroll to trigger lazy loading, and return
        a list of {'url': ..., 'title': ...} dicts for every image found."""
        browser = webdriver.Chrome()
        try:
            browser.get(Spider.url)  # was hard-coded; reuse the class constant
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".egeli_pic_m")))
            # Get the total page count (kept for reference, currently unused)
            #page = browser.find_element(By.CSS_SELECTOR,".listpages ul li:last-child a")
            #pages = re.findall(Spider.pages_pattern,page.get_attribute("href"))
            #print(int(pages[0]))
            # Scroll to the bottom repeatedly so lazy-loaded images appear
            # (6 passes as a trial run).
            for i in range(6):
                browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                print("第"+str(i+1)+"进行下拉操作")
                time.sleep(3)  # give the page time to load the new batch
            # Collect the image elements and extract src/title attributes.
            container = browser.find_element(By.CSS_SELECTOR, ".egeli_pic_m")
            elements = container.find_elements(By.CSS_SELECTOR, ".egeli_pic_li dl dd img")
            images = []
            for el in elements:
                images.append({'url': el.get_attribute("src"),
                               'title': el.get_attribute("title")})
            return images
        finally:
            # Always release the browser, even on failure (original leaked it).
            browser.quit()

    def get_image(self, images):
        """Download each image in *images* into Spider.directory, skipping
        files that already exist, and report how many were saved."""
        i = 0  # number of images saved this run
        os.makedirs(Spider.directory, exist_ok=True)
        for img in images:
            # The title comes from an untrusted web page: strip characters
            # that are illegal in file names before building the path.
            safe_title = re.sub(r'[\\/:*?"<>|]', '_', img['title'])
            dirc = os.path.join(Spider.directory, safe_title + '.jpg')
            # Skip images that were already downloaded.
            if os.path.exists(dirc):
                continue
            # timeout prevents a single stalled request from hanging the run.
            res = requests.get(img['url'], timeout=30)
            if res.status_code != 200:
                # Don't write an error page to disk; report and move on.
                print("下载失败: %s" % img['url'])
                continue
            with open(dirc, 'wb') as f:
                f.write(res.content)
            i += 1
        print("本次共保存%s张图片" % i)

    def go(self):
        """Entry point: scrape the image list, download it, print elapsed time."""
        start = time.time()
        images = self.get_html()
        self.get_image(images)
        end = time.time()
        print("本次总共耗时:", end - start)
# Run the scraper only when executed as a script, not when imported:
# the original launched a Chrome browser as an import side effect.
if __name__ == "__main__":
    spider = Spider()
    spider.go()