import json
import logging
import time

from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
class SeleniumDownloaderMiddleware:
    """Scrapy downloader middleware that sniffs an XHR response via headless Chrome.

    For every request the page is opened in a Selenium-driven headless Chrome,
    one page element is clicked (best effort), and Chrome's ``performance``
    log is scanned for a network request whose URL contains
    ``TARGET_URL_FRAGMENT``. The body of that request's response is then
    fetched through the DevTools ``Network.getResponseBody`` command and
    logged.
    """

    # Seconds to wait after navigating, so the page can issue its requests.
    PAGE_LOAD_WAIT = 3
    # Seconds to wait after the click, so follow-up requests get logged.
    CLICK_WAIT = 1
    # Element clicked after the page loads; a missing element is tolerated.
    EXPAND_XPATH = '//section//div[@class="_1piuevz"]'
    # Substring identifying the network request whose response is captured.
    TARGET_URL_FRAGMENT = "PdpAvailabilityCalendar"

    logger = logging.getLogger(__name__)

    def __init__(self, executable_path=r'D:/tool/chromedriver.exe'):
        """Start a headless Chrome session with performance logging enabled.

        Args:
            executable_path: Path to the chromedriver binary. Defaults to the
                location that was previously hard-coded.
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option('w3c', False)
        chrome_options.add_argument('--headless')
        # NOTE(review): when both ``options`` and ``desired_capabilities`` are
        # passed, Selenium appears to rebuild 'goog:chromeOptions' from
        # ``options``, so the 'perfLoggingPrefs' entry below may not reach
        # the browser - confirm against the installed Selenium version.
        caps = {
            'browserName': 'chrome',
            'loggingPrefs': {
                'browser': 'ALL',
                'driver': 'ALL',
                'performance': 'ALL',
            },
            'goog:chromeOptions': {
                'perfLoggingPrefs': {
                    'enableNetwork': True,
                },
                'w3c': False,
            },
        }
        self.driver = webdriver.Chrome(
            executable_path=executable_path,
            options=chrome_options,
            desired_capabilities=caps,
        )

    def process_request(self, request, spider):
        """Load ``request.url`` in Chrome and log the captured XHR body.

        Args:
            request: The Scrapy request being processed; only ``url`` is read.
            spider: The spider issuing the request (unused).

        Returns:
            Always ``None``, so Scrapy carries on handling the request with
            its remaining middlewares and downloader. The captured body is
            only logged, not turned into a response.
        """
        self.driver.get(url=request.url)
        time.sleep(self.PAGE_LOAD_WAIT)
        try:
            self.driver.find_element("xpath", self.EXPAND_XPATH).click()
        except Exception:
            # Best effort: the element may be absent or not clickable on
            # this page. Keep going, but leave a trace instead of hiding it.
            self.logger.debug(
                "Could not click %r on %s", self.EXPAND_XPATH, request.url,
                exc_info=True,
            )
        time.sleep(self.CLICK_WAIT)

        request_log = self.driver.get_log('performance')
        self.logger.debug("len(request_log) %d", len(request_log))

        request_id = self._find_request_id(request_log, self.TARGET_URL_FRAGMENT)
        if request_id is None:
            self.logger.debug(
                "No request matching %r was logged for %s",
                self.TARGET_URL_FRAGMENT, request.url,
            )
            return None

        # Fetch the response body of the matching request by its requestId.
        content = self.driver.execute_cdp_cmd(
            'Network.getResponseBody', {'requestId': request_id}
        )
        self.logger.info("%s", content)
        return None

    @staticmethod
    def _find_request_id(log_entries, url_fragment):
        """Return the requestId of the first logged request matching a URL.

        Args:
            log_entries: Entries as returned by ``driver.get_log('performance')``;
                each is a dict whose ``'message'`` value is a JSON string.
            url_fragment: Substring the request URL must contain.

        Returns:
            The ``requestId`` of the first matching entry, or ``None`` when
            no entry matches.
        """
        for entry in log_entries:
            payload = json.loads(entry['message'])
            params = payload.get('message', {}).get('params', {})
            # Not every logged event describes a request; skip those without one.
            logged_request = params.get('request')
            if logged_request is None:
                continue
            # A request without a URL must not break the substring check.
            url = logged_request.get('url') or ''
            request_id = params.get('requestId')
            if url_fragment in url and request_id is not None:
                return request_id
        return None

    def __del__(self):
        """Shut down the browser session when the middleware is collected."""
        # ``driver`` is missing if __init__ failed before assigning it.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # quit() ends the whole session (browser and chromedriver),
            # whereas close() only closes the current window.
            driver.quit()
# Selenium network-capture scraping example, implemented as a Scrapy middleware.
# First published 2023-03-10 17:20:55.