为什么要用selenium去爬取?
1、用requests爬取时,详情页面动态加载的数据总是难以获取(如有更好的方法请多多指教)
2、学习selenium
好了,不多说,上代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
from hashlib import md5
import os
URL = 'https://www.toutiao.com'
class TouTiaoImg():
    """Crawl image galleries from toutiao.com search results with Selenium.

    Workflow: open the home page, search a keyword, walk the result list,
    open each result's detail page and download every image it contains
    into ``C:/toutiaoImage/<title>/``.
    """

    def __init__(self):
        # One Chrome driver and a shared 10-second explicit wait.
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 10)
        self.url = URL

    # Open the home page and submit the search keyword.
    def first_step(self, keyword):
        try:
            self.browser.get(self.url)
            # Search input box and search button on the home page.
            inputs = self.wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'tt-input__inner')))
            button = self.wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, '.tt-input-group__append button')))
            inputs.send_keys(keyword)
            button.click()
            time.sleep(2)
            # Results open in a new tab; switch to the list page.
            # (switch_to_window was removed in Selenium 4 — use switch_to.window.)
            self.browser.switch_to.window(self.browser.window_handles[1])
            time.sleep(2)
            self.second_step()
        except TimeoutException:
            # BUG FIX: the retry previously called first_step() without the
            # required `keyword` argument, raising TypeError on any timeout.
            self.first_step(keyword)
        finally:
            self.browser.close()

    # Walk the result list, visiting every image item by its sequential id.
    def second_step(self):
        box = self.wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '.feedBox div .sections')))
        i = 0
        while True:
            # Scroll to the bottom so further results are lazily loaded.
            self.browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            try:
                # find_element_by_id was removed in Selenium 4.
                imgItem = box.find_element(By.ID, 'J_section_' + str(i))
            except NoSuchElementException:
                break  # no more result items
            try:
                # A `.lbox` child marks a gallery-style (slideshow) item.
                imgItem.find_element(By.CSS_SELECTOR, 'div div .lbox')
                flag = True
            except NoSuchElementException:
                flag = False
            ele = imgItem.find_element(By.CSS_SELECTOR, 'div div .normal div div a')
            title = ele.find_element(By.CSS_SELECTOR, 'span').text
            ele.click()  # open the selected detail page (new tab)
            time.sleep(2)
            self.browser.switch_to.window(self.browser.window_handles[2])  # detail page
            self.third_step(flag, title)
            self.browser.execute_script('window.close()')  # close the detail tab
            self.browser.switch_to.window(self.browser.window_handles[1])  # back to list
            i += 1

    # Extract image URLs from one detail page and save them under `title`.
    def third_step(self, flag, title):
        print(title)
        try:
            if flag:  # gallery-style page: <img> tags inside <article>
                article = self.wait.until(
                    EC.presence_of_element_located((By.TAG_NAME, 'article')))
                for imgItem in article.find_elements(By.TAG_NAME, 'img'):
                    url = imgItem.get_attribute('src')
                    print(url)
                    self.save_img(url, title)
            else:  # article-style page: anchors inside the .image-list box
                imgBox = self.wait.until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'image-list')))
                for imgItem in imgBox.find_elements(By.CLASS_NAME, 'image-item'):
                    url = imgItem.find_element(By.CSS_SELECTOR, 'div a').get_attribute('href')
                    print(url)
                    self.save_img(url, title)
        except TimeoutException:
            # Video pages contain neither container, so the wait times out.
            print('NO IMAGE')
        print("\n")

    # Download one image into C:/toutiaoImage/<dir>/, named by MD5 of its bytes.
    def save_img(self, url, dir):
        dirPath = 'C:/toutiaoImage/' + dir
        # makedirs creates the missing parent as well and tolerates re-runs
        # (os.mkdir failed when C:/toutiaoImage did not yet exist).
        os.makedirs(dirPath, exist_ok=True)
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            return  # skip unreachable images instead of saving an error page
        # MD5-based filename deduplicates identical images automatically.
        save_name = dirPath + '/{}.jpg'.format(md5(response.content).hexdigest())
        if not os.path.exists(save_name):
            with open(save_name, 'wb') as f:
                f.write(response.content)
if __name__ == '__main__':
    # Entry point: change the search keyword to whatever you like.
    crawler = TouTiaoImg()
    crawler.first_step('cosplay')