- 基本操作示例
from selenium import webdriver
from selenium.webdriver.common.by import By

# Path to the browser driver binary; the driver can also be placed in the
# program's current directory instead.
driver_path = r"path"
# Create the browser object.
# NOTE: `executable_path` is the Selenium 3 spelling; Selenium 4 deprecates it
# in favour of passing a `Service` object.
driver = webdriver.Chrome(executable_path=driver_path)
try:
    # Open the URL.
    driver.get("https://www.baidu.com/")
    # Read the text content of the element whose id is "wrapper".
    # find_element(By.ID, ...) is used because the find_element_by_* helpers
    # no longer exist in current Selenium releases.
    data = driver.find_element(By.ID, "wrapper").text
    print(data)
    # Take a screenshot of the page and save it.
    driver.save_screenshot("baidu.png")
    # Type the keyword "python" into the search box (id="kw").
    driver.find_element(By.ID, "kw").send_keys("python")
    # Click the search button (id="su").
    driver.find_element(By.ID, "su").click()
    # Print the page source as rendered by the browser.
    print(driver.page_source)
    # Read the cookies of the current page.
    cookie = driver.get_cookies()
    # Read the URL of the current page.
    url = driver.current_url
    # Close the current page.
    driver.close()
finally:
    # Shut the browser down even when one of the steps above raised, so no
    # browser/driver process is left behind.
    driver.quit()
- 元素定位的方法
# The find_element_by_* helpers were removed from Selenium; the generic
# find_element(By.<strategy>, value) form works on old and new releases alike.
from selenium.webdriver.common.by import By

# Locate by the id attribute.
element = driver.find_element(By.ID, "passwd-id")
# Locate by the name attribute.
element = driver.find_element(By.NAME, "user-name")
# Locate by tag name; find_elements returns a list of every matching element.
elements = driver.find_elements(By.TAG_NAME, "input")
# Locate with an XPath expression.
element = driver.find_element(By.XPATH, "//input[@id='passwd-id']")
- 设置等待方式
显式等待:
# By names the locator strategy for the wait condition; the strategies are the
# same ones used when locating elements.
from selenium.webdriver.common.by import By
# WebDriverWait is responsible for the repeated (looping) wait.
from selenium.webdriver.support.ui import WebDriverWait
# expected_conditions supplies the conditions that can be waited on.
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the element with id "njn" to be present.
locator = (By.ID, "njn")
element_present = EC.presence_of_element_located(locator)
WebDriverWait(driver, timeout=10).until(element_present)
隐式等待:
from selenium.webdriver.common.by import By

# Implicit wait of 10 seconds, set once on the driver before any lookup.
driver.implicitly_wait(10)
# Placeholder: fill in the URL of the page to open.
driver.get("")
# find_element(By.ID, ...) replaces the removed find_element_by_id helper.
element = driver.find_element(By.ID, "nsfbc")
- 设置无头模式与开发者模式
from selenium.webdriver.chrome.options import Options

options = Options()
# Run without a browser window (headless mode).
for flag in ("--headless", "--disable-gpu"):
    options.add_argument(flag)
# Do not load images, which speeds up page access.
image_prefs = {"profile.managed_default_content_settings.images": 2}
options.add_experimental_option("prefs", image_prefs)
# Developer mode: drop the automation switch so that sites are less likely to
# recognise that Selenium is being used.
excluded_switches = ['enable-automation']
options.add_experimental_option('excludeSwitches', excluded_switches)
- 行为链
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

cboxTag = driver.find_element(By.XPATH, "//*[@id='cboxClose']")
# Keep the element itself. Calling .send_keys() here would both type the text
# immediately and leave keywordTag set to None (send_keys returns nothing),
# so the chain below would have no element to type into.
keywordTag = driver.find_element(By.ID, "keyword")
# Create an action chain.
actions = ActionChains(driver)
# Move the mouse to the given element.
actions.move_to_element(cboxTag)
# Click it.
actions.click(cboxTag)
# Type into the given element.
actions.send_keys_to_element(keywordTag, "python")
# Nothing happens until perform() runs the queued actions.
actions.perform()
以拉勾网为例:使用 Selenium 登录后,爬取指定地点及指定关键字的职位信息。
- 应用案例
# -*- coding: utf-8 -*-
# @Software:PyCharm
# @File:LG_spider.py
# @Author:Aaron—Liu
# @Time:2020/07.03/17:46
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import os
import csv
import time
# Chrome session created when this module is loaded; login(),
# get_detai_source() and main() all use this module-level object.
driver=webdriver.Chrome()
def _click_when_present(locator, timeout):
    """Wait up to *timeout* seconds for *locator* to be in the DOM, then click it."""
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator)
    ).click()


def login():
    """Log in to lagou.com by WeChat QR code and run the job search.

    Drives the module-level ``driver`` through these steps: open the home
    page, close the city chooser, open the login page, switch to QR-code
    login, open the search page, close the advertisement, pick the city
    (Xi'an according to the original notes) and submit the keyword "python".

    :return: None
    """
    driver.get("https://www.lagou.com")
    # Close the city-selection overlay.
    driver.find_element(By.XPATH, "//*[@id='cboxClose']").click()
    time.sleep(5)
    # Click the login link to reach the login page.
    driver.find_element(By.XPATH, "//div[@class='lg_tbar_r']/ul/li[3]/a").click()
    # Switch to QR-code login.
    _click_when_present((By.XPATH, "//div[3]/div[1]/div/div/div[2]/div[1]"), 3)
    # NOTE(review): presumably this pause is the time the user has to scan
    # the QR code - confirm.
    time.sleep(10)
    # Click the search button to reach the page where a city can be chosen.
    driver.find_element(By.ID, "search_button").click()
    # Close the advertisement.
    _click_when_present((By.CLASS_NAME, "body-btn"), 10)
    # Wait for the city options, then click the city to scrape.
    _click_when_present(
        (By.XPATH, "//*[@id='filterCollapse']/div[1]/div[2]/li/div[2]/div/a[9]"),
        10,
    )
    # Wait for the search box, then enter the job keyword.
    keyword_box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "keyword"))
    )
    keyword_box.send_keys("python")
    time.sleep(3)
    driver.find_element(By.ID, "submit").click()
def get_link_list(source):
    """Collect the job links on one result page and process each of them.

    :param source: HTML source of a job-list page.
    :return: None; every link found is handed to get_detai_source().
    """
    page_html = etree.HTML(source)
    # href of every anchor carrying the "position_link" class.
    job_links = page_html.xpath("//a[@class='position_link']/@href")
    for job_link in job_links:
        get_detai_source(job_link)
def get_detai_source(job_link):
    """Open a job detail page in a new window, parse it, and close it again.

    :param job_link: URL of one job posting.
    :return: None; the page source is handed to parse().
    :raises RuntimeError: if the browser did not open a new window.
    """
    # Remember where we came from so we can return to the list page afterwards.
    list_handle = driver.current_window_handle
    handles_before = set(driver.window_handles)
    # Pass the URL as a script argument rather than pasting it into the
    # JavaScript text, so quotes in the link cannot break the script.
    driver.execute_script("window.open(arguments[0])", job_link)
    new_handles = [h for h in driver.window_handles if h not in handles_before]
    if not new_handles:
        # Without this guard the cleanup below would close the list page.
        raise RuntimeError(f"no new window was opened for {job_link}")
    # Move to the freshly opened detail page.
    driver.switch_to.window(new_handles[0])
    try:
        # Wait until the detail page has rendered.
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (By.XPATH, "//*[@id='job_detail']/dd[4]/div/div/a/span[1]")
            )
        )
        # Grab the source of the detail page and extract the data.
        parse(driver.page_source)
    finally:
        # Close the detail page and go back to the list page even when the
        # wait or the parsing failed, so stale windows do not pile up.
        driver.close()
        driver.switch_to.window(list_handle)
def parse(detail_source):
    """Extract the job fields from a detail page and save them as one row.

    :param detail_source: HTML source of a job detail page.
    :return: None; the row is passed to save_data() and printed.
    """
    detail_html = etree.HTML(detail_source)

    def joined(xpath):
        # Concatenate every match; no match at all yields an empty string.
        return ''.join(detail_html.xpath(xpath))

    # Common prefix of the header fields.
    head = "//div[@class='position-head']/div/div[1]"
    # Deletes newlines, non-breaking spaces and plain spaces in one pass.
    strip_table = str.maketrans('', '', '\n\xa0 ')

    company = joined(head + "//h4/text()")
    name = joined(head + "/div/@title")
    salary = joined(head + "//dd//span[1]/text()")
    education = joined(head + "//dd//span[4]/text()").replace('/', '')
    job_detail = joined("//*[@id='job_detail']/dd[2]/div//text()").translate(strip_table)
    address = joined("//*[@id='job_detail']/dd[3]/div[1]/a[2]/text()")

    data = [company, name, salary, education, job_detail, address]
    save_data(data)
    print(data)
def save_data(data, filename="lagou.csv"):
    """Append one job record to the CSV file, adding the header when needed.

    The header row is written first whenever the file does not exist yet or
    exists but is still empty.

    :param data: the six field values of one job, in header order.
    :param filename: CSV file to append to; defaults to "lagou.csv" in the
        current working directory.
    :return: None
    """
    header = ['公司', '职位', '薪水', '教育', '详情', '地址']
    needs_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
    # Append mode creates the file when it is missing; newline='' lets the
    # csv module control line endings itself.
    with open(filename, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(header)
        writer.writerow(data)
def main():
    """Log in, then scrape the result pages one after another.

    After each page is processed, the last ``span`` of the pager is treated
    as the "next page" button. The loop stops when that button carries the
    "pager_next_disabled" class, when the pager does not show up within
    5 seconds, or when moving to the next page fails. The browser is shut
    down on the way out, whether or not an error occurred.

    :return: None
    """
    from selenium.common.exceptions import TimeoutException, WebDriverException

    next_xpath = "//div[@class='pager_container']/span[last()]"
    try:
        login()
        while True:
            get_link_list(driver.page_source)
            try:
                # Explicit wait: up to 5 s for the next-page element.
                next_btn = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, next_xpath))
                )
                # get_attribute may give None when there is no class at all.
                if "pager_next_disabled" in (next_btn.get_attribute('class') or ''):
                    # Last page reached.
                    break
                next_btn.click()
            except TimeoutException:
                print("Pager not found within 5 seconds; stopping.")
                break
            except WebDriverException as exc:
                # Stop rather than loop: retrying would scrape the same page
                # again and again.
                print(f"Could not move to the next page: {exc}")
                break
    finally:
        driver.quit()
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()