前提:拥有能够登录系统的用户名和密码
问题:网站有验证码验证
因为某些网站使用会变化的验证码进行验证,一开始想用opencv库自动识别,但是发现识别效果不太好(没学过深度学习),遂改用selenium手动登录,再获取Request Headers以及页面响应
selenium 进行登录,手动输入登录信息以及验证码后跳转到自己想要的页面
登录
# Send all browser traffic through the proxy via Chrome's --proxy-server flag.
chrome_options = Options()
proxy_flag = f'--proxy-server={proxy.proxy}'
chrome_options.add_argument(proxy_flag)
# Switch off W3C mode for this Chrome session.
chrome_options.add_experimental_option('w3c', False)
driver = webdriver.Chrome(options=chrome_options)

# Open the login page.
driver.get("登陆页面的url")

# The operator reads the captcha in the browser window and types the user
# name, password and captcha into the console, in that order.
userHand, passswordHand, Seccodeverify = (
    input(prompt)
    for prompt in (
        "-----在下面人工输入用户名-----\n:",
        "-----在下面人工输入密码-----\n:",
        "-----在下面人工输入验证码-----\n:",
    )
)
time.sleep(3)

# Locate every form element first, then fill the fields in the same order.
username_input, password_input, captcha_input = (
    driver.find_element(By.ID, field_id)
    for field_id in ("form_item_name", "form_item_password", "form_item_captcha")
)
login_btn = driver.find_element(By.CLASS_NAME, "login-btn")
for form_field, typed_text in (
    (username_input, userHand),
    (password_input, passswordHand),
    (captcha_input, Seccodeverify),
):
    form_field.send_keys(typed_text)

# Submit the login form.
login_btn.click()
访问页面
# Visit the page whose traffic should be captured; the recorded requests carry
# the built-in parameters that the target API expects.
time.sleep(1)
# URL of the page you want to scrape.
driver.get("url")
# Fixed 10-second pause after navigation (presumably to let the page finish
# loading before it is read).
time.sleep(10)
page_element = driver.page_source
# Buttons to click on the page; fill in the visible text to match on.
waitingCheckNumber = driver.find_element(By.XPATH, "//*[text()='']")
checkedNumber = driver.find_element(By.XPATH, "//*[text()='']")
# SubmitNumber = driver.find_element(By.XPATH, "//*[text()='已提交']")
# Copy the browser's cookies into the requests session so later API calls
# share the logged-in state. driver.session_id is NOT used here: it identifies
# the WebDriver session, not a cookie issued by the website.
for browser_cookie in driver.get_cookies():
    session.cookies.set(
        browser_cookie['name'],
        browser_cookie['value'],
        domain=browser_cookie.get('domain', ''),
        path=browser_cookie.get('path', '/'),
    )
waitingCheckNumber.click()
time.sleep(1)
checkedNumber.click()
time.sleep(1)
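这边不知道有没有必要更新session中的cookie(获取到的是空的),因为学的比较浅就没深究。感觉还是要看网站请求中有没有cookie:有的话还是有必要更新一下的,没有的话就算了。需要注意的是,driver.session_id 是 WebDriver 会话的编号,并不是网站下发的cookie;如果要把浏览器的登录状态同步到requests的session中,应当用 driver.get_cookies() 读取浏览器里的cookie。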
browsermobproxy获取request Headers以及payload等一系列参数
def urlToDict(url):
    """Return the query-string parameters of *url* as a flat dict.

    Repeated parameters keep their last value and blank values are kept as
    empty strings, so every value is a plain ``str``.

    NOTE(review): the original helper is not shown anywhere in this file; this
    implementation is inferred from how ``payload`` is used (string values,
    e.g. ``int(payload['pageNum'])``) -- confirm against the original.
    """
    from urllib.parse import parse_qs, urlparse

    query = parse_qs(urlparse(url).query, keep_blank_values=True)
    return {name: values[-1] for name, values in query.items()}


# Start the BrowserMob proxy and begin recording a HAR that includes request
# headers and content.
server = Server(r'E:\pythonProject\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat')
server.start()
proxy = server.create_proxy()
proxy.new_har("turangsanpuadmin", options={'captureHeaders': True, 'captureContent': True})
# NOTE: read the HAR only after the browser has loaded the pages of interest;
# it contains nothing but the traffic recorded since new_har().
result = proxy.har
for entry in result['log']['entries']:
    _url = entry['request']['url']
    if '你需要请求的接口api(如果不知道的话可以先看一下result里面有什么)' not in _url:
        continue
    # Build the query parameters from the matching request only, so an
    # unrelated entry can never leak into the payload.
    payload = urlToDict(_url)
    payload['pageSize'] = 50
    # Reuse the request headers exactly as the browser sent them.
    for header in entry['request']['headers']:
        headers[header['name']] = header['value']
    # For POST requests the parameters live in the request body instead:
    # _postDatas = json.loads(entry['request']['postData']['text'])
    # for postData in _postDatas:
    #     payload[postData] = _postDatas[postData]
    # payload['pageSize'] = 50  # ask for the largest page size
    # To inspect what the API returned:
    # _response = entry['response']
    # _content = _response['content']['text']
    # print(_response)
    # print(_content)
    break
else:
    # Without a matching request there are no parameters or headers to replay.
    raise RuntimeError("no captured request matched the target API")
然后就可以快乐去请求接口数据啦
# GET: the captured parameters travel in the query string.
response = session.get(target_url, params=payload, headers=headers)
# POST: the parameters must be serialised to JSON and sent as the request body.
request_body = json.dumps(payload)
response = session.post(target_url, data=request_body, headers=headers)
完整代码如下
import base64
import json
import pandas as pd
from selenium.webdriver.common.by import By
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from browsermobproxy import Server
# 分离url
from urllib.parse import urlparse, parse_qs, urlencode
# Launch the BrowserMob proxy and start recording a HAR that includes request
# headers and content.
browsermob_launcher = r'E:\pythonProject\browsermob-proxy-2.1.4\bin\browsermob-proxy.bat'
server = Server(browsermob_launcher)
server.start()
proxy = server.create_proxy()
har_options = {'captureHeaders': True, 'captureContent': True}
proxy.new_har("turangsanpuadmin", options=har_options)

# requests session used for the API calls made after the browser login.
session = requests.Session()

# Chrome instance whose traffic is routed through the proxy.
chrome_options = Options()
chrome_options.add_argument(f'--proxy-server={proxy.proxy}')
chrome_options.add_experimental_option('w3c', False)
driver = webdriver.Chrome(options=chrome_options)
# Open the login page.
driver.get("url")

# The operator reads the captcha in the browser window and types the user
# name, password and captcha into the console, in that order.
userHand, passswordHand, Seccodeverify = (
    input(prompt)
    for prompt in (
        "-----在下面人工输入用户名-----\n:",
        "-----在下面人工输入密码-----\n:",
        "-----在下面人工输入验证码-----\n:",
    )
)
time.sleep(3)

# Locate every form element first (fill in the element ids / class name of the
# target site), then type into the fields in the same order.
username_input, password_input, captcha_input = (
    driver.find_element(By.ID, field_id) for field_id in ("", "", "")
)
login_btn = driver.find_element(By.CLASS_NAME, "")
for form_field, typed_text in (
    (username_input, userHand),
    (password_input, passswordHand),
    (captcha_input, Seccodeverify),
):
    form_field.send_keys(typed_text)

# Submit the login form.
login_btn.click()
# Headers for the later API calls; filled in from the captured traffic.
headers = {}
# Visit the page whose traffic should be captured; the recorded requests carry
# the built-in parameters that the target API expects.
time.sleep(1)
# URL of the page you want to scrape.
driver.get("url")
# Fixed 10-second pause after navigation (presumably to let the page finish
# loading before it is read).
time.sleep(10)
page_element = driver.page_source
# Buttons to click on the page; fill in the visible text to match on.
waitingCheckNumber = driver.find_element(By.XPATH, "//*[text()='']")
checkedNumber = driver.find_element(By.XPATH, "//*[text()='']")
# SubmitNumber = driver.find_element(By.XPATH, "//*[text()='已提交']")
# Copy the browser's cookies into the requests session so later API calls
# share the logged-in state. driver.session_id is NOT used here: it identifies
# the WebDriver session, not a cookie issued by the website.
for browser_cookie in driver.get_cookies():
    session.cookies.set(
        browser_cookie['name'],
        browser_cookie['value'],
        domain=browser_cookie.get('domain', ''),
        path=browser_cookie.get('path', '/'),
    )
waitingCheckNumber.click()
time.sleep(1)
checkedNumber.click()
time.sleep(1)
def urlToDict(url):
    """Return the query-string parameters of *url* as a flat dict.

    Repeated parameters keep their last value and blank values are kept as
    empty strings, so every value is a plain ``str``.

    NOTE(review): the original helper is not shown anywhere in this file; this
    implementation is inferred from how ``payload`` is used (string values,
    e.g. ``int(payload['pageNum'])``) -- confirm against the original.
    """
    query = parse_qs(urlparse(url).query, keep_blank_values=True)
    return {name: values[-1] for name, values in query.items()}


# Read the recorded traffic and find the request that hit the target API.
result = proxy.har
api_path = '/api/ssp/dcyd/xj/yzjsh/page'
for entry in result['log']['entries']:
    _url = entry['request']['url']
    if api_path not in _url:
        continue
    # Build the query parameters from the matching request only, so an
    # unrelated entry can never leak into the payload.
    payload = urlToDict(_url)
    payload['pageSize'] = 50
    # Reuse the request headers exactly as the browser sent them.
    for header in entry['request']['headers']:
        headers[header['name']] = header['value']
    # For POST requests the parameters live in the request body instead:
    # _postDatas = json.loads(entry['request']['postData']['text'])
    # for postData in _postDatas:
    #     payload[postData] = _postDatas[postData]
    # payload['pageSize'] = 50  # ask for the largest page size
    # To inspect what the API returned:
    # _response = entry['response']
    # _content = _response['content']['text']
    # print(_response)
    # print(_content)
    break
else:
    # Nothing to replay: shut down the browser and the proxy instead of
    # continuing with parameters taken from an unrelated request.
    driver.quit()
    server.stop()
    raise RuntimeError(f"no captured request matched {api_path!r}")
# Endpoint to query; replace the placeholder with the real API URL.
target_url = 'api'
# 定义拿数据的方法 post
# def getData(target_url, payload, headers):
# records_list_number = 0
# records_list_total = 0
# response = session.post(target_url, json.dumps(payload), headers=headers)
# while response.status_code == 200:
# response_text_dict = json.loads(response.text)
# records_list_number += len(response_text_dict['result']['records'])
# records_list_total = response_text_dict['result']['total']
# print(response_text_dict)
# while records_list_number < records_list_total:
# payload['pageNum'] += 1
# response = session.post(target_url, json.dumps(payload), headers=headers)
# break
# if records_list_number == records_list_total:
# break
# 定义拿数据的方法 get
def getDataGet(target_url, payload, headers, http_session=None, browser=None):
    """Fetch every page of a paginated GET endpoint and return all records.

    Each response body is parsed as JSON and read as
    ``{"result": {"records": [...], "total": <count>}}``. Further pages are
    requested by incrementing ``pageNum`` until ``total`` records have been
    collected, a page comes back empty, or a response is not HTTP 200.

    Args:
        target_url: URL of the API endpoint.
        payload: Query parameters for the first request. The dict is copied,
            so the caller's object is not modified while paging.
        headers: HTTP headers sent with every request.
        http_session: Object with a requests-style ``get(url=, params=,
            headers=)`` method. Defaults to the module-level ``session``.
        browser: Object whose ``quit()`` is called once paging ends, whether
            it succeeded or failed. Defaults to the module-level ``driver``.

    Returns:
        list: The records of all fetched pages, in the order received.

    Raises:
        KeyError: If a response lacks ``result``, ``records`` or ``total``.
        ValueError: If a response body is not valid JSON or ``pageNum`` is
            not an integer.
    """
    if http_session is None:
        http_session = session
    if browser is None:
        browser = driver

    params = dict(payload)
    records = []
    try:
        while True:
            response = http_session.get(url=target_url, params=params, headers=headers)
            if response.status_code != 200:
                break
            result = json.loads(response.text)['result']
            page_records = result['records']
            for record in page_records:
                # Placeholder: site-specific handling of each record goes here.
                records.append(record)
            # Stop on an empty page or once the advertised total is reached;
            # ">=" also covers a server that returns more rows than "total",
            # which would otherwise spin forever on the same response.
            if not page_records or len(records) >= result['total']:
                break
            # Assumes 1-based paging when the captured URL carried no
            # pageNum -- TODO confirm against the target API.
            params['pageNum'] = str(int(params.get('pageNum', 1)) + 1)
    finally:
        # Always release the browser, even when a request or parse fails.
        browser.quit()
    return records
# Page through the target API using the query parameters and headers captured
# from the browser traffic.
getDataGet(target_url, payload, headers)