selenium
-
selenium可以实现功能爬虫。
-
作用:可以实现浏览器自动化的操作。
-
pip install selenium
-
准备浏览器的驱动程序(网上下载)
-
演示程序:
-
# Demo: drive Chrome to search jd.com, scroll to the bottom of the result
# page and print the rendered page source.
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

# 1. Create a browser object backed by the local chromedriver binary.
# NOTE(review): `executable_path` is the Selenium 3 style of pointing at the
# driver; recent Selenium 4 releases expect a Service object instead --
# confirm against the installed Selenium version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    # 2. Issue the sequence of browser actions, starting with navigation.
    bro.get('https://www.jd.com')

    # 3. Locate tags. find_element(By.XPATH, ...) is used instead of
    #    find_element_by_xpath because it is available in both Selenium 3
    #    and Selenium 4.
    search_text = bro.find_element(By.XPATH, '//*[@id="key"]')
    search_text.send_keys('mac pro')
    sleep(2)
    btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
    btn.click()
    sleep(2)

    # 4. JS injection: scroll to the bottom of the page.
    bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
    sleep(3)

    # 5. Capture the page source of the page currently displayed.
    page_text = bro.page_source
    print(page_text)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
-
selenium和爬虫之间的关联
- 便捷的捕获动态加载的数据(可见即可得)
- 便捷的实现模拟登录
-
动态加载数据的捕获
-
# Demo: capture dynamically loaded data by paging through the NMPA licence
# list in a real browser and parsing every captured page with lxml.
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    url = 'http://scxk.nmpa.gov.cn:81/xk/'
    bro.get(url)
    sleep(1)

    # The list initially holds the page source of the first page.
    page_text_list = [bro.page_source]
    for _ in range(5):
        # Click the "next page" button, then store the page source of the
        # newly shown page in the list as well.
        bro.find_element(By.XPATH, '//*[@id="pageIto_next"]').click()
        sleep(2)
        page_text_list.append(bro.page_source)

    # Parse the captured pages.
    for page_text in page_text_list:
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//*[@id="gzlist"]/li')
        for li in li_list:
            title = li.xpath('./dl/@title')[0]
            print(title)
        # NOTE(review): the original snippet lost its indentation; this
        # one-second pause is assumed to run once per page -- confirm.
        sleep(1)
    sleep(3)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
-
动作链
-
switch_to的操作:
-
# Demo: switch_to.frame() -- locating a tag that lives inside an iframe.
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Locate the draggable div.
    # Note: when the target tag is nested inside an iframe, the driver has to
    # switch into that frame first, otherwise the tag cannot be located.
    bro.switch_to.frame('iframeResult')
    source_div = bro.find_element(By.XPATH, '//*[@id="draggable"]')
    print(source_div)
    sleep(3)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
-
动作链:一系列连续的动作
-
# Demo: action chains -- press and hold a draggable div, move it in five
# small steps, then release it.
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver import ActionChains  # action chains
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='chromedriver.exe')
try:
    bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Locate the draggable div.
    # Note: when the target tag is nested inside an iframe, the driver has to
    # switch into that frame first, otherwise the tag cannot be located.
    bro.switch_to.frame('iframeResult')
    source_div = bro.find_element(By.XPATH, '//*[@id="draggable"]')
    print(source_div)
    sleep(1)

    # Use an action chain to carry out a series of consecutive actions.
    action = ActionChains(bro)
    action.click_and_hold(source_div)  # press and keep holding; queued only
    for i in range(5):
        print(i)
        # perform() executes the queued actions immediately.
        action.move_by_offset(10, 10).perform()
        sleep(0.5)
    # release() only queues the action; without perform() the mouse button
    # would never actually be released.
    action.release().perform()
    sleep(3)
finally:
    # Always shut the browser down, even when a step above raised.
    bro.quit()
-
淘宝滑动案例:
-
# Demo: fill in the Taobao login form and drag the slider with an action chain.
# NOTE(review): the original never calls bro.quit(), so the browser stays open
# after the script ends; that behavior is kept -- confirm it is intended.
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver import ActionChains  # action chains
from selenium.webdriver.common.by import By

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://login.taobao.com/member/login.jhtml?spm=a21bo.21814703.754894437.1.5af911d9w1MZbl&f=top&redirectURL=https%3A%2F%2Fwww.taobao.com%2F')
sleep(2)

bro.find_element(By.XPATH, '//*[@id="fm-login-id"]').send_keys('649641514@qq.com')
sleep(1)
bro.find_element(By.XPATH, '//*[@id="fm-login-password"]').send_keys('fdsfdsfdsfds')
sleep(2)

# The slider handle that has to be dragged.
source_span = bro.find_element(By.XPATH, '//*[@id="nc_1_n1z"]')
print(source_span)
sleep(1)

action = ActionChains(bro)
action.click_and_hold(source_span)  # queued; runs with the first perform()
for _ in range(5):
    action.move_by_offset(7, 0).perform()
    sleep(0.5)
# release() only queues the action; perform() is what actually lets go of
# the mouse button.
action.release().perform()
-
-
模拟登录
-
12306模拟登录
-
# Demo: simulated 12306 login. Fills in the account form, crops the captcha
# out of a full-page screenshot, sends it to a captcha-solving service and
# clicks the coordinates the service returns.
import base64
import json
from time import sleep

# Third-party packages (requests, Pillow, selenium) are imported inside the
# functions that need them so the pure helper below can be imported on its
# own. Pillow is installed with: pip install Pillow


def base64_api(uname, pwd, img, typeid):
    """Send the image file *img* to the ttshitu recognition API.

    Args:
        uname: account name for the recognition service.
        pwd: password for the recognition service.
        img: path of the image file to upload.
        typeid: recognition type id passed through to the service.

    Returns:
        ``result["data"]["result"]`` when the response reports success,
        otherwise the service's ``message`` string.
    """
    import requests

    with open(img, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()
    data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
    result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)
    if result['success']:
        return result["data"]["result"]
    return result["message"]


def parse_click_points(result):
    """Turn a result such as ``'60,140|249,93'`` into ``[[60, 140], [249, 93]]``.

    A result holding a single point (no ``'|'``) yields a one-element list,
    because ``str.split`` returns the whole string when the separator is absent.

    Raises:
        ValueError: if *result* is not a list of ``x,y`` integer pairs, e.g.
            when the recognition service returned an error message instead.
    """
    points = []
    for pair in result.split('|'):
        fields = pair.split(',')
        try:
            points.append([int(fields[0]), int(fields[1])])
        except (IndexError, ValueError) as err:
            raise ValueError(
                'unexpected captcha result: {!r}'.format(result)
            ) from err
    return points


def main():
    """Run the login flow in Chrome and click the recognised captcha points."""
    from PIL import Image  # basic image processing
    from selenium import webdriver
    from selenium.webdriver import ActionChains  # action chains
    from selenium.webdriver.common.by import By

    bro = webdriver.Chrome(executable_path='chromedriver.exe')
    try:
        bro.get('https://kyfw.12306.cn/otn/resources/login.html')
        sleep(1)
        bro.find_element(By.XPATH, '/html/body/div[2]/div[2]/ul/li[2]/a').click()
        sleep(1)
        bro.find_element(By.XPATH, '//*[@id="J-userName"]').send_keys('123@qq.com')
        sleep(1)
        bro.find_element(By.XPATH, '//*[@id="J-password"]').send_keys('123443434343')
        sleep(1)

        # Take a screenshot of the page.
        bro.save_screenshot('./main.png')

        # Crop the captcha region out of the screenshot.
        # 1. Get the position and size of the captcha image element.
        code_img = bro.find_element(By.XPATH, '//*[@id="J-loginImg"]')
        location = code_img.location
        size = code_img.size
        # rangle is the crop box: the (x, y) of the captcha's upper-left
        # corner followed by the (x, y) of its lower-right corner.
        rangle = (
            int(location['x']),
            int(location['y']),
            int(location['x'] + size['width']),
            int(location['y'] + size['height']),
        )
        with Image.open('./main.png') as screenshot:
            frame = screenshot.crop(rangle)
            frame.save('./code.png')

        # Have the captcha recognised; the service returns click coordinates.
        img_path = "./code.png"
        result = base64_api(uname='bb328410948', pwd='bb328410948', img=img_path, typeid=27)
        print(result)

        # Click every coordinate returned by the service:
        # '60,140|249,93' ==> [[x, y], [x, y]]
        # NOTE(review): the offsets are passed as-is to
        # move_to_element_with_offset; the origin that call uses (element
        # corner vs. centre) depends on the Selenium version -- confirm.
        for x, y in parse_click_points(result):
            ActionChains(bro).move_to_element_with_offset(code_img, x, y).click().perform()
            sleep(1)
    finally:
        # Always shut the browser down, even when a step above raised.
        bro.quit()


if __name__ == '__main__':
    main()
-
无头浏览器
-
# Demo: headless browser -- open a page without a visible window and save a
# screenshot of it.
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Create an options object that makes Chrome start in headless (no UI) mode.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Driver path; create the browser object.
# The options object is passed via `options=`; the `chrome_options=` keyword
# is the deprecated spelling of the same parameter.
browser = webdriver.Chrome(executable_path='chromedriver.exe', options=chrome_options)
try:
    # Visit the page.
    url = 'https://www.baidu.com/'
    browser.get(url)
    time.sleep(3)
    browser.save_screenshot('baidu.png')
finally:
    # Always shut the browser down, even when a step above raised.
    browser.quit()
-
-
python执行js代码
-
PyExecJS介绍:PyExecJS 是一个可以使用 Python 来模拟运行 JavaScript 的库。我们需要pip install PyExecJS对其进行环境安装。
- 注意:本机必须额外安装nodejs开发环境
-
将调试好的js代码存放到一个js源文件中进行保存
-
# Demo: run a function from a saved JavaScript source file via PyExecJS.
import execjs

node = execjs.get()

# Compile the JS source file; `with` makes sure the file handle is closed
# once its contents have been read.
with open('test.js', encoding='utf-8') as js_file:
    ctx = node.compile(js_file.read())

# Call the chosen function from the compiled JS file.
result = ctx.eval('getPwd("111111")')
print(result)
-
整个完整流程参考:https://www.cnblogs.com/bobo-zhang/p/11243138.html
-