selenium在爬虫中的基本应用

  1. 基本操作示例
from selenium import webdriver
# Path to the chromedriver binary; it can also be placed next to the script
driver_path=r"path"
# Create the browser (driver) object
driver=webdriver.Chrome(executable_path=driver_path)
# Navigate to a URL
driver.get("https://www.baidu.com/")
# Get the text content of the element whose id is "wrapper"
data=driver.find_element_by_id("wrapper").text
print(data)
# Take a snapshot of the page and save it
driver.save_screenshot("baidu.png")
# Type the keyword "python" into the search box (id="kw")
driver.find_element_by_id("kw").send_keys("python")
# Click the search button (id="su")
driver.find_element_by_id("su").click()
# Print the page source after JavaScript rendering
print(driver.page_source)
# Get the cookies of the current page
cookie=driver.get_cookies()
# Get the URL of the current page
url=driver.current_url
# Close the current window
driver.close()
# Quit the browser entirely
driver.quit()
  2. 元素定位的方法
# Locate by the id attribute
element = driver.find_element_by_id("passwd-id")
# Locate by the name attribute
element = driver.find_element_by_name("user-name")
# Locate by tag name (note: find_elements_* returns a list of elements)
element = driver.find_elements_by_tag_name("input")
# Locate via an XPath expression
element = driver.find_element_by_xpath("//input[@id='passwd-id']")
  3. 设置等待方式
显式等待:
# By: locator strategies, same vocabulary as the find_element_* methods
from selenium.webdriver.common.by import By
# WebDriverWait polls repeatedly until the condition holds (or times out)
from selenium.webdriver.support.ui import WebDriverWait
# expected_conditions: ready-made wait conditions
from selenium.webdriver.support import expected_conditions as EC

WebDriverWait(driver,timeout=10).until(
		EC.presence_of_element_located((By.ID,"njn"))
	)
隐式等待:
driver.implicitly_wait(10)  # wait up to 10 s on every element lookup
driver.get("")
element=driver.find_element_by_id("nsfbc")
  4. 设置无头模式与开发者模式
from selenium.webdriver.chrome.options import Options

options = Options()
# Run Chrome in headless (no-GUI) mode
options.add_argument("--headless")
options.add_argument("--disable-gpu")
# Do not load images, to speed up page loads
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# "Developer mode": drop the automation switch so sites are less likely
# to detect that Selenium is driving the browser
options.add_experimental_option('excludeSwitches',['enable-automation'])
  5. 行为链
from selenium.webdriver.common.action_chains import ActionChains

# Element used as a click target (the "close" button of a popup)
cboxTag=driver.find_element_by_xpath("//*[@id='cboxClose']")
# BUG FIX: the original chained .send_keys("python") here, so keywordTag
# held None (send_keys returns None) and send_keys_to_element below would
# fail. Keep the element itself instead.
keywordTag=driver.find_element_by_id("keyword")
# Build an action chain (actions are queued, executed only on perform())
actions=ActionChains(driver)
# Move the mouse to the element
actions.move_to_element(cboxTag)
# Click it
actions.click(cboxTag)
# Type into the given element
actions.send_keys_to_element(keywordTag,"python")
# Execute the queued actions
actions.perform()

以拉勾网为例应用selenium登录后爬取指定地点及指定关键字的职位信息

  6. 应用案例
# -*- coding: utf-8 -*-
# @Software:PyCharm
# @File:LG_spider.py
# @Author:Aaron—Liu
# @Time:2020/07.03/17:46

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree
import os
import csv
import time

# Module-level driver shared by every function below (assumes chromedriver is on PATH)
driver=webdriver.Chrome()

def login():
	"""Log in to lagou.com via a WeChat QR code and set up the search.

	Flow: open the home page, dismiss the city-selection popup, open the
	login page, switch to QR-code login, give the user time to scan the
	code, pick the target city (Xi'an), then submit the keyword "python"
	into the job search box.

	Uses the module-level `driver`; returns nothing. The XPaths are tied
	to the site layout at the time of writing and will break if it changes.
	"""
	url="https://www.lagou.com"
	driver.get(url)
	# Dismiss the location-chooser popup
	driver.find_element_by_xpath("//*[@id='cboxClose']").click()
	# driver.find_element_by_class_name("login").click()
	time.sleep(5)
	# Click the login button to reach the login page
	driver.find_element_by_xpath("//div[@class='lg_tbar_r']/ul/li[3]/a").click()
	# Switch to QR-code login
	WebDriverWait(driver,3).until(
		EC.presence_of_element_located((By.XPATH,"//div[3]/div[1]/div/div/div[2]/div[1]"))
	)
	driver.find_element_by_xpath("//div[3]/div[1]/div/div/div[2]/div[1]").click()
	# Give the user 10 s to scan the QR code, then click search to reach
	# the page that offers the selectable cities
	time.sleep(10)
	driver.find_element_by_id("search_button").click()
	# Dismiss the advertisement overlay
	WebDriverWait(driver, 10).until(
		EC.presence_of_element_located((By.CLASS_NAME, "body-btn"))
	)
	driver.find_element_by_class_name("body-btn").click()
	# Wait until the city filter appears, then click the target city (Xi'an)
	WebDriverWait(driver, 10).until(
		EC.presence_of_element_located((By.XPATH, "//*[@id='filterCollapse']/div[1]/div[2]/li/div[2]/div/a[9]"))
	)
	driver.find_element_by_xpath("//*[@id='filterCollapse']/div[1]/div[2]/li/div[2]/div/a[9]").click()
	# Wait for the search box, then type the job keyword to scrape
	WebDriverWait(driver, 10).until(
		EC.presence_of_element_located((By.ID, "keyword"))
	)
	driver.find_element_by_id("keyword").send_keys("python")
	time.sleep(3)
	driver.find_element_by_id(("submit")).click()

def get_link_list(source):
	"""Extract every job-detail URL from one result page and visit each.

	:param source: HTML source of one search-result page
	:return: None (each link is handed to get_detai_source)
	"""
	tree = etree.HTML(source)
	# Every job card carries its detail URL on an <a class="position_link">
	for link in tree.xpath("//a[@class='position_link']/@href"):
		# Open the detail page and scrape its fields
		get_detai_source(link)


def get_detai_source(job_link):
	"""
	Open one job's detail page in a new tab, scrape it, and close the tab.

	The statement order is essential: open tab -> switch to it -> wait for
	the page -> grab source -> parse -> close tab -> switch back to the list.

	:param job_link: URL of a single job's detail page
	:return: None (data is persisted via parse() -> save_data())
	"""
	# Open the detail page in a new browser tab via JavaScript
	driver.execute_script(f"window.open('{job_link}')")
	# Switch the driver to the newly opened tab (index 1)
	driver.switch_to.window(driver.window_handles[1])
	# Wait until the detail page has rendered
	WebDriverWait(driver, 5).until(
		EC.presence_of_element_located((By.XPATH, "//*[@id='job_detail']/dd[4]/div/div/a/span[1]"))
	)
	# Source of the rendered detail page
	detail_source = driver.page_source
	# Extract and save the fields we need
	parse(detail_source)
	# Close the detail tab
	driver.close()
	# Switch back to the job-list tab (index 0)
	driver.switch_to.window((driver.window_handles[0]))

def parse(detail_source):
	"""Extract the job fields from a detail page and persist them.

	:param detail_source: HTML source of a job detail page
	:return: None (row is written via save_data, then echoed to stdout)
	"""
	tree = etree.HTML(detail_source)

	def text_at(xp):
		# Join every text node matched by the XPath into one string
		return ''.join(tree.xpath(xp))

	head = "//div[@class='position-head']/div/div[1]"
	company = text_at(head + "//h4/text()")
	name = text_at(head + "/div/@title")
	salary = text_at(head + "//dd//span[1]/text()")
	education = text_at(head + "//dd//span[4]/text()").replace('/', '')
	job_detail = text_at("//*[@id='job_detail']/dd[2]/div//text()")
	job_detail = job_detail.replace('\n', '').replace('\xa0', '').replace(' ', '')
	address = text_at("//*[@id='job_detail']/dd[3]/div[1]/a[2]/text()")
	row = [company, name, salary, education, job_detail, address]
	save_data(row)
	print(row)

def save_data(data, filename="lagou.csv"):
	"""Append one job record to a CSV file, writing the header row first
	when the file does not exist yet.

	The original duplicated the open/writer logic across two branches;
	a single append-mode open covers both cases identically.

	:param data: list of field values [company, name, salary, education,
		detail, address]
	:param filename: target CSV path (defaults to "lagou.csv", matching
		the original behavior)
	:return: None
	"""
	# Decide whether the header is needed before opening (append creates the file)
	is_new = not os.path.exists(filename)
	# newline='' prevents blank rows on Windows per the csv module docs
	with open(filename, 'a', encoding='utf-8', newline='') as f:
		writer = csv.writer(f)
		if is_new:
			writer.writerow(['公司', '职位', '薪水', '教育', '详情', '地址'])
		writer.writerow(data)


def main():
	"""Drive the whole scrape: log in, then page through all result pages.

	Stops when the "next page" button is disabled (last page) or when the
	pager can no longer be located (e.g. the page failed to load).
	"""
	login()
	# Loop until the last result page is reached
	while True:
		# Scrape every job link on the current result page
		source = driver.page_source
		get_link_list(source)
		try:
			# Explicit wait: up to 5 s for the pager's last element
			# (the "next page" button) to appear. BUG FIX: this wait sat
			# outside the try in the original, so a timeout escaped the
			# handler and crashed the loop.
			WebDriverWait(driver, 5).until(
				EC.presence_of_element_located((By.XPATH, "//div[@class='pager_container']/span[last()]"))
			)
			next_btn = driver.find_element_by_xpath("//div[@class='pager_container']/span[last()]")
			# A disabled "next" button marks the last page: stop
			if "pager_next_disabled" in next_btn.get_attribute('class'):
				break
			next_btn.click()
		except Exception:
			# BUG FIX: the original bare `except` printed the source and
			# kept looping, re-scraping the same page forever on a
			# persistent failure. Log the page and stop instead.
			print(source)
			break

if __name__=='__main__':
	main()
  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值