最近因工作需要,要爬取某个系统的数据。该系统只能在低版本IE浏览器下使用,因此尝试改用360浏览器;同时考虑到方便更多人扩展该爬虫,选用selenium来驱动浏览器。现把用到的主要技术点整理如下,以备日后查询。
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
1.selenium打开浏览器,注意设置浏览器的启动路径
# Launch the 360 browser through chromedriver and open the target page.
options = webdriver.ChromeOptions()
# Point Selenium at the 360 browser executable instead of a stock Chrome install.
options.binary_location = r"C:\Users\83371\AppData\Roaming\360se6\Application\360se.exe"
# Path to the chromedriver executable used to drive the browser.
service = Service(r'D:\Python\Python37\chromedriver.exe')
drive = webdriver.Chrome(service=service, options=options)
drive.get('http://12.12.12.12:80/apm/')
2.js给textBox赋值
# Fill the login form by assigning the input values directly from JavaScript.
# Each script looks up the first element with the given name and sets .value.
js = "var obj = document.getElementsByName('userName')[0];obj.value='cc';"
drive.execute_script(js)
js = "var obj = document.getElementsByName('passWord')[0];obj.value='cc';"
drive.execute_script(js)
5.xpath定位,send_keys赋值textBox
# Locate the login inputs by XPath and type into them, then submit.
# "//*[@name='...']" matches any element whose name attribute equals the value;
# the "*" node test is required -- "//[@name=...]" is not valid XPath.
drive.find_element(by=By.XPATH, value="//*[@name='userName']").send_keys("cc")
drive.find_element(by=By.XPATH, value="//*[@name='passWord']").send_keys("cc")
drive.find_element(by=By.XPATH, value="//*[@class='sureButton']").click()
6.xpath定位,根据text确定菜单位置,点击菜单
# Find the top-level menu entry by its visible text, click it, then click the
# child link underneath it.
elements = drive.find_elements(by=By.XPATH, value="//*[@class='nav-label']")
for el in elements:
    if el.text == '立项管理':
        # Clicking the parent entry expands the submenu so the child link
        # below can be clicked.
        el.click()
        break
drive.find_element(by=By.XPATH, value="//*[@href='app/list']").click()
7.xpath定位,切换iframe
# Switch the driver context into the iframe whose src contains 'app/list';
# subsequent lookups and scripts run inside that frame.
drive.switch_to.frame(drive.find_element(by=By.XPATH, value="//iframe[contains(@src,'app/list')]"))
8.js定位,赋值select
# Choose an option of the 'pt_category' <select> from JavaScript.
# obj[1] is the option at index 1 (the second entry) of the first matching element.
js = "var obj = document.getElementsByName('pt_category')[0];obj[1].selected = true;"
drive.execute_script(js)
9.js定位,赋值datepicker
# Set the 'pt_start' date field by writing its value directly from JavaScript.
js = "var obj = document.getElementsByName('pt_start')[0];obj.value = '2022-03-21';"
drive.execute_script(js)
# Same field filled by typing through Selenium instead.
# NOTE(review): presumably an alternative to the script above rather than a
# second step -- running both types the date into an already-filled field; confirm.
drive.find_element(by=By.XPATH, value='//*[@name="pt_start"]').send_keys('2022-03-21')
10.验证码识别
# Recognise a captcha image with ddddocr and print the recognised text.
import ddddocr

ocr = ddddocr.DdddOcr()
# classification() is given the raw image bytes, so read the file in binary mode.
with open('aa.jpg', 'rb') as f:
    img_bytes = f.read()
res = ocr.classification(img_bytes)
print(res)