学习自《python3网络爬虫开发实战》, 整理以备日后回顾
requests库的话是第三方库,需要安装
pycharm 第三方库安装流程 File —> settings —> Project —> Interpreter —> + —> install
个人习惯用 Chrome,webdriver 需下载与浏览器版本对应的驱动,版本不匹配会报错无法使用
- 一些基础知识
# Visit a page and interact with a node (the taobao.com search box).
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

browser = webdriver.Chrome()
browser.get('https://www.taobao.com')
# find_element_by_id / find_element_by_xpath were removed in Selenium 4;
# the replacement is find_element(By.<strategy>, value).
# Both strategies below locate the same search box:
search_input = browser.find_element(By.ID, 'q')
search_input = browser.find_element(By.XPATH, '//*[@id="q"]')
# Type, pause, clear, retype, then submit via the search button.
search_input.send_keys('hfox')
time.sleep(1)
search_input.clear()
search_input.send_keys('hhfox')
button = browser.find_element(By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button')
button.click()
browser.close()
查找节点的方法有很多,这里只记录本人常用的函数
- 动作链
# Action chains: drag one node onto another inside an iframe.
# Frame switching:
#   browser.switch_to.frame('iframeResult')  enters the iframe
#   browser.switch_to.parent_frame()         returns to the parent frame
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
import time

browser = webdriver.Chrome()
browser.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
browser.switch_to.frame('iframeResult')
# find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, ...)
source = browser.find_element(By.XPATH, '//*[@id="draggable"]')
target = browser.find_element(By.XPATH, '//*[@id="droppable"]')
# Queue a drag-and-drop, then replay the queued actions with perform().
action = ActionChains(browser)
action.drag_and_drop(source, target)
action.perform()
- 执行 JavaScript
# Run arbitrary JavaScript in the page context:
# scroll to the bottom of the page, then pop an alert box.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/explore')
for script in ('window.scrollTo(0, document.body.scrollHeight)',
               'alert("surprise")'):
    driver.execute_script(script)
- 获取节点属性 内容 相对位置等
# Inspect a node: attributes, text content, element id, position and size.
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
url = 'https://www.zhihu.com/explore'
browser.get(url)
# find_element_by_xpath was removed in Selenium 4; use find_element(By.XPATH, ...)
article = browser.find_element(By.XPATH, '//*[@id="special"]/div[2]/div/div[2]/div[1]/div[1]/a')
print(article.get_attribute('class'))  # value of the class attribute
print(article.text)                    # visible text of the node
print(article.id)                      # WebDriver-internal element id
print(article.location)                # coordinates within the page
print(article.size)                    # width / height of the node
- 等待
# Implicit wait: every element lookup polls for up to 10 seconds
# before raising, instead of failing immediately.
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Chrome()
browser.implicitly_wait(10)
browser.get('https://www.baidu.com')
# find_element_by_id was removed in Selenium 4; use find_element(By.ID, ...)
# NOTE(review): Baidu's search box id is normally 'kw', not 'q' — verify.
browser.find_element(By.ID, 'q')
# Explicit wait: block until a specific condition is met or 10 s elapse.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get('https://www.baidu.com')
wait = WebDriverWait(browser, 10)
# The original lambda called find_element_by_id, which was removed in
# Selenium 4; presence_of_element_located is the idiomatic condition.
wait.until(EC.presence_of_element_located((By.ID, 'q')))
- cookies
# Cookie management: read the site's cookies, add one, then clear them all.
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/explore')
print(driver.get_cookies())  # cookies set by the site on load
new_cookie = {'name': 'hfox', 'value': 'germey', 'domain': 'www.zhihu.com'}
driver.add_cookie(new_cookie)
print(driver.get_cookies())  # now includes the cookie added above
driver.delete_all_cookies()
print(driver.get_cookies())  # empty after deletion
- 选项卡相关
# Tab management: open a second tab, load a different page in each,
# then close them one at a time.
from selenium import webdriver
import time

driver = webdriver.Chrome()
driver.get('https://www.zhihu.com/explore')
driver.execute_script('window.open()')  # opens a blank second tab
first_tab, second_tab = driver.window_handles[0], driver.window_handles[1]
driver.switch_to.window(first_tab)
driver.get('https://www.baidu.com')
time.sleep(2)
driver.switch_to.window(second_tab)
driver.get('https://taobao.com')
driver.close()  # closes only the second tab
driver.switch_to.window(first_tab)
driver.close()  # closes the remaining tab
# Browser history: back and forward.
# Standalone fragment — assumes a live `browser` that has already
# visited at least two pages; back() returns to the previous page,
# forward() re-advances through the history.
browser.back()
browser.forward()
- 异常处理
# Exception handling: catch a page-load timeout and a missing element.
# (The original snippet had lost its indentation after try/except,
# which is a syntax error; restored here.)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException

browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
except TimeoutException:
    print('time out')
try:
    # find_element_by_id was removed in Selenium 4; use find_element(By.ID, ...)
    browser.find_element(By.ID, 'q')
except NoSuchElementException:
    print('no such element')
finally:
    # Always release the browser, whether or not the lookup succeeded.
    browser.close()