Python selenium+browsermobproxy获取request Header or Token

前提:拥有能够登录系统的用户名和密码

问题:网站有验证码验证

因为某些网站使用动态变化的验证码进行验证,一开始想用opencv库自动识别,但是发现识别效果不太好(没学过深度学习),遂改用selenium进行登录,再获取request Headers以及页面响应

selenium 进行登录,手动输入登录信息以及验证码后跳转到自己想要的页面

登录
# Build the Chrome options for the Selenium-driven browser.
chrome_options = Options()
# Route the browser's traffic through the proxy (`proxy.proxy` supplies the
# address) so the requests it makes can be recorded.
chrome_options.add_argument('--proxy-server={0}'.format(proxy.proxy))
# Switch off the 'w3c' experimental option.
# NOTE(review): whether this option is still accepted depends on the installed
# Selenium/ChromeDriver versions - confirm before reuse.
chrome_options.add_experimental_option('w3c', False)
driver = webdriver.Chrome(options=chrome_options)
# Open the login page (the string is a placeholder for the real login URL).
driver.get("登陆页面的url")
# Ask the operator on the console for the user name, password and captcha;
# the values are typed in by hand rather than recognised automatically.
userHand = input("-----在下面人工输入用户名-----\n:")
passswordHand = input("-----在下面人工输入密码-----\n:")
Seccodeverify = input("-----在下面人工输入验证码-----\n:")
# Fixed pause before touching the page.
time.sleep(3)
# Locate the three form fields and the login button.
username_input = driver.find_element(By.ID, "form_item_name")
password_input = driver.find_element(By.ID, "form_item_password")
captcha_input = driver.find_element(By.ID, "form_item_captcha")
login_btn = driver.find_element(By.CLASS_NAME, "login-btn")
# Type the values entered on the console into the form.
username_input.send_keys(userHand)
password_input.send_keys(passswordHand)
captcha_input.send_keys(Seccodeverify)
# Submit the form by clicking the login button.
login_btn.click()
访问页面
# Open the page whose background API calls should be recorded, so that the
# parameters those calls need end up in the proxy's capture.
time.sleep(1)
# Placeholder: URL of the page to capture.
driver.get("url")
# Fixed pause that gives the page time to finish loading.
time.sleep(10)
page_element = driver.page_source
# Placeholders: the two elements to click (fill in their visible text inside
# text()='').  Example with real text:
#     driver.find_element(By.XPATH, "//*[text()='已提交']")
waitingCheckNumber = driver.find_element(By.XPATH, "//*[text()='']")
checkedNumber = driver.find_element(By.XPATH, "//*[text()='']")
# Copy the cookies the browser holds after logging in into the requests
# session.  `driver.session_id` only identifies the WebDriver session and is
# not a cookie of the website, so it must not be sent as one.
cookie = {item['name']: item['value'] for item in driver.get_cookies()}
session.cookies.update(cookie)
# Click both elements, pausing briefly after each click.
waitingCheckNumber.click()
time.sleep(1)
checkedNumber.click()
time.sleep(1)

这边不知道有没有必要更新session中的cookie(获取到的是空的),因为学的比较浅就没深究,感觉还是要看网站请求中有没有cookie,有的话还是有必要更新一下的,没有的话就算了

browsermobproxy获取request Headers以及payload等一系列参数
# Start the BrowserMob Proxy server and open a proxy on it.
server = Server(r'E:\pythonProject\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
server.start()
proxy = server.create_proxy()
# Start a new HAR recording that keeps request headers and bodies.
proxy.new_har("turangsanpuadmin", options={'captureHeaders': True, 'captureContent': True})


def urlToDict(url):
    """Return the query-string parameters of *url* as a flat dict.

    Every value is kept as a string.  When a parameter is repeated the last
    occurrence wins; parameters with an empty value are preserved.
    """
    from urllib.parse import parse_qs, urlparse

    query = urlparse(url).query
    return {
        name: values[-1]
        for name, values in parse_qs(query, keep_blank_values=True).items()
    }


# The browser has to load and click through the target page between
# `new_har` and this point; otherwise the recording has no entries.
result = proxy.har
# Headers to replay when calling the endpoint directly.
headers = {}
for entry in result['log']['entries']:
    _url = entry['request']['url']
    if '你需要请求的接口api(如果不知道的话可以先看一下result里面有什么)' not in _url:
        continue
    # Take the parameters from the matching request only, so an unrelated
    # request can never leak its query string into `payload`.
    payload = urlToDict(_url)
    # Ask for larger pages to reduce the number of requests.
    payload['pageSize'] = 50
    # Reuse the headers (token etc.) the browser sent with this request.
    for header in entry['request']['headers']:
        headers[header['name']] = header['value']
    # For a POST endpoint the parameters are in the request body instead:
    #     payload.update(json.loads(entry['request']['postData']['text']))
    #     payload['pageSize'] = 50
    # and the captured response body is entry['response']['content']['text'].
    break
else:
    # Fail loudly instead of continuing with wrong or missing parameters.
    raise LookupError('no captured request matched the target endpoint')

然后就可以快乐去请求接口数据啦
# GET variant: the captured parameters are sent as the query string.
response = session.get(url=target_url, params=payload, headers=headers)
# POST variant.
# Note that for the POST request the parameters are first serialised to a
# JSON string with json.dumps.
response = session.post(target_url, json.dumps(payload), headers=headers)

完整代码如下

import base64
import json
import pandas as pd
from selenium.webdriver.common.by import By
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from browsermobproxy import Server
# 分离url
from urllib.parse import urlparse, parse_qs, urlencode
# Needed only for the clean-up hooks registered below.
import atexit

# Start the BrowserMob Proxy server and open a proxy on it.
server = Server(r'E:\pythonProject\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
server.start()
# Stop the proxy's background process when the script exits, even if a later
# step raises; otherwise it keeps running after the script has finished.
atexit.register(server.stop)
proxy = server.create_proxy()
# Start a new HAR recording that keeps request headers and bodies.
proxy.new_har("turangsanpuadmin", options={'captureHeaders': True, 'captureContent': True})
# requests session used later to call the API directly.
session = requests.Session()
# Create the browser instance and route its traffic through the proxy so
# every request it makes is recorded.
chrome_options = Options()
chrome_options.add_argument('--proxy-server={0}'.format(proxy.proxy))
# Switch off the 'w3c' experimental option.
# NOTE(review): whether this option is still accepted depends on the installed
# Selenium/ChromeDriver versions - confirm before reuse.
chrome_options.add_experimental_option('w3c', False)
driver = webdriver.Chrome(options=chrome_options)
# Close the browser on exit as well.  Hooks run in reverse registration
# order, so the browser is closed before the proxy is stopped.
atexit.register(driver.quit)
# Open the login page ("url" is a placeholder for the real login URL).
driver.get("url")
# Ask the operator on the console for the user name, password and captcha;
# the values are typed in by hand rather than recognised automatically.
userHand = input("-----在下面人工输入用户名-----\n:")
passswordHand = input("-----在下面人工输入密码-----\n:")
Seccodeverify = input("-----在下面人工输入验证码-----\n:")
# Fixed pause before touching the page.
time.sleep(3)
# Locate the three form fields and the login button.  The empty strings are
# placeholders for the element ids / class name of the real login form.
username_input = driver.find_element(By.ID, "")
password_input = driver.find_element(By.ID, "")
captcha_input = driver.find_element(By.ID, "")
login_btn = driver.find_element(By.CLASS_NAME, "")
# Type the values entered on the console into the form.
username_input.send_keys(userHand)
password_input.send_keys(passswordHand)
captcha_input.send_keys(Seccodeverify)
# Submit the form by clicking the login button.
login_btn.click()
# Headers to replay when calling the API directly; filled in from the
# captured traffic further down.
headers = {}

# Open the page whose background API calls should be recorded, so that the
# parameters those calls need end up in the proxy's capture.
time.sleep(1)
# Placeholder: URL of the page to capture.
driver.get("url")
# Fixed pause that gives the page time to finish loading.
time.sleep(10)
page_element = driver.page_source
# Placeholders: the two elements to click (fill in their visible text inside
# text()='').  Example with real text:
#     driver.find_element(By.XPATH, "//*[text()='已提交']")
waitingCheckNumber = driver.find_element(By.XPATH, "//*[text()='']")
checkedNumber = driver.find_element(By.XPATH, "//*[text()='']")
# Copy the cookies the browser holds after logging in into the requests
# session.  `driver.session_id` only identifies the WebDriver session and is
# not a cookie of the website, so it must not be sent as one.
cookie = {item['name']: item['value'] for item in driver.get_cookies()}
session.cookies.update(cookie)
# Click both elements, pausing briefly after each click.
waitingCheckNumber.click()
time.sleep(1)
checkedNumber.click()
time.sleep(1)

def urlToDict(url):
    """Return the query-string parameters of *url* as a flat dict.

    Every value is kept as a string.  When a parameter is repeated the last
    occurrence wins; parameters with an empty value are preserved.
    """
    query = urlparse(url).query
    return {
        name: values[-1]
        for name, values in parse_qs(query, keep_blank_values=True).items()
    }


# Look through the traffic recorded by the proxy for the first request to the
# target endpoint and reuse its query parameters and headers.
api_marker = '/api/ssp/dcyd/xj/yzjsh/page'
result = proxy.har
for entry in result['log']['entries']:
    _url = entry['request']['url']
    if api_marker not in _url:
        continue
    # Take the parameters from the matching request only, so an unrelated
    # request can never leak its query string into `payload`.
    payload = urlToDict(_url)
    # Ask for larger pages to reduce the number of requests.
    payload['pageSize'] = 50
    # Reuse the headers (token etc.) the browser sent with this request.
    for header in entry['request']['headers']:
        headers[header['name']] = header['value']
    # For a POST endpoint the parameters are in the request body instead:
    #     payload.update(json.loads(entry['request']['postData']['text']))
    #     payload['pageSize'] = 50
    # and the captured response body is entry['response']['content']['text'].
    break
else:
    # Fail loudly instead of continuing with wrong or missing parameters.
    raise LookupError('no captured request matched %r' % api_marker)

# Placeholder: URL of the endpoint to query.
target_url = 'api'


# 定义拿数据的方法 post
# def getData(target_url, payload, headers):
#     records_list_number = 0
#     records_list_total = 0
#     response = session.post(target_url, json.dumps(payload), headers=headers)
#     while response.status_code == 200:
#         response_text_dict = json.loads(response.text)
#         records_list_number += len(response_text_dict['result']['records'])
#         records_list_total = response_text_dict['result']['total']
#         print(response_text_dict)
#         while records_list_number < records_list_total:
#             payload['pageNum'] += 1
#             response = session.post(target_url, json.dumps(payload), headers=headers)
#             break
#         if records_list_number == records_list_total:
#             break
# 定义拿数据的方法 get
def getDataGet(target_url, payload, headers):
    """Fetch every page of a paginated GET endpoint and return its records.

    Uses the module-level ``session`` for the requests and always closes the
    module-level ``driver`` before returning, whether or not the download
    succeeded.

    Args:
        target_url: URL of the endpoint.
        payload: Query parameters.  ``payload['pageNum']`` is advanced in
            place for every further page; page 1 is assumed when the key is
            missing.
        headers: Request headers to send with every call.

    Returns:
        The records of all pages fetched, in order.  Fetching stops at the
        first response whose status is not 200, at the first empty page, or
        once ``result.total`` records have been collected.
    """
    collected = []
    try:
        while True:
            response = session.get(url=target_url, params=payload, headers=headers)
            if response.status_code != 200:
                break
            result = json.loads(response.text)['result']
            page_records = result['records']
            for record in page_records:
                # Record-specific processing goes here.
                collected.append(record)
            # `>=` and the empty-page check guarantee termination even when
            # the reported total is wrong or changes while paging.
            if not page_records or len(collected) >= result['total']:
                break
            payload['pageNum'] = str(int(payload.get('pageNum', 1)) + 1)
    finally:
        driver.quit()
    return collected


records = getDataGet(target_url, payload, headers)




  • 2
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值