Selenium 抓包抓取实例:Scrapy 中间件

from selenium import webdriver
import time
import json
from scrapy.http import HtmlResponse
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
class SeleniumDownloaderMiddleware:
    """Scrapy downloader middleware that sniffs network traffic via headless Chrome.

    For each request the page is opened in a Selenium-driven Chrome instance
    with performance (network) logging enabled.  The performance log is then
    scanned for the first outgoing request whose URL contains
    ``TARGET_URL_KEYWORD`` and the body of its response is fetched through the
    Chrome DevTools Protocol and printed.

    The tunables below are class attributes so a subclass (or a test) can
    override them without touching the method bodies.
    """

    # Location of the chromedriver binary handed to ``webdriver.Chrome``.
    CHROMEDRIVER_PATH = r'D:/tool/chromedriver.exe'
    # Element that is clicked (best effort) after the page has loaded.
    CLICK_XPATH = '//section//div[@class="_1piuevz"]'
    # Substring identifying the API call whose response body is wanted.
    TARGET_URL_KEYWORD = "PdpAvailabilityCalendar"
    # Fixed waits, in seconds, after navigation and after the click.
    PAGE_LOAD_WAIT = 3
    POST_CLICK_WAIT = 1

    def __init__(self):
        """Start a headless Chrome session with performance logging enabled."""
        chrome_options = webdriver.ChromeOptions()
        # NOTE(review): W3C mode is switched off both here and in ``caps``;
        # presumably this is needed for the legacy 'loggingPrefs' capability
        # to be honoured -- verify against the installed Selenium/chromedriver.
        chrome_options.add_experimental_option('w3c', False)
        chrome_options.add_argument('--headless')
        caps = {
            'browserName': 'chrome',
            'loggingPrefs': {
                'browser': 'ALL',
                'driver': 'ALL',
                'performance': 'ALL',
            },
            'goog:chromeOptions': {
                'perfLoggingPrefs': {
                    'enableNetwork': True,
                },
                'w3c': False,
            },
        }
        self.driver = webdriver.Chrome(
            executable_path=self.CHROMEDRIVER_PATH,
            options=chrome_options,
            desired_capabilities=caps,
        )

    @staticmethod
    def _find_request_id(log_entries, url_keyword):
        """Return the requestId of the first logged request whose URL contains *url_keyword*.

        Args:
            log_entries: Iterable of performance-log entries as returned by
                ``driver.get_log('performance')``; each is a mapping whose
                ``'message'`` value is a JSON string.
            url_keyword: Substring to look for in the request URL.

        Returns:
            The matching ``requestId``, or ``None`` when no entry matches.
            Entries that are malformed, carry no ``request`` object, or lack
            a URL / requestId are skipped instead of raising.
        """
        for entry in log_entries:
            try:
                params = json.loads(entry['message'])['message']['params']
            except (KeyError, TypeError, ValueError):
                continue
            if not isinstance(params, dict):
                continue
            # .get() is used so that events without these fields do not raise.
            logged_request = params.get('request')
            if not isinstance(logged_request, dict):
                continue
            url = logged_request.get('url') or ''
            if url_keyword in url:
                request_id = params.get('requestId')
                if request_id is not None:
                    return request_id
        return None

    def process_request(self, request, spider):
        """Load ``request.url`` in Chrome and print the targeted API response body.

        Args:
            request: The Scrapy request being processed; only ``request.url``
                is read.
            spider: The spider issuing the request (unused).

        Returns:
            ``None``.  The captured body is only printed, not wrapped in a
            response object.
        """
        # Imported locally so the file's top-level import block stays untouched.
        from selenium.common.exceptions import WebDriverException

        self.driver.get(url=request.url)
        time.sleep(self.PAGE_LOAD_WAIT)
        try:
            self.driver.find_element("xpath", self.CLICK_XPATH).click()
        except WebDriverException:
            # Best effort: the element may be absent or not clickable.
            pass
        time.sleep(self.POST_CLICK_WAIT)

        request_log = self.driver.get_log('performance')
        print("len(request_log)", len(request_log))

        # The scan uses its own local names so the ``request`` argument is
        # never overwritten by a logged CDP request dict.
        request_id = self._find_request_id(request_log, self.TARGET_URL_KEYWORD)
        if request_id is not None:
            # Fetch the response body of the matched call via its requestId.
            content = self.driver.execute_cdp_cmd(
                'Network.getResponseBody', {'requestId': request_id}
            )
            print(content)

        # NOTE: to short-circuit Scrapy's own download, build and return an
        # HtmlResponse(url=request.url, body=..., request=request,
        # encoding='utf-8') here instead of returning None.
        return None

    def __del__(self):
        """Shut down the browser session when the middleware is collected."""
        # ``driver`` is missing when __init__ failed before assigning it.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # quit() ends the whole session (and the chromedriver process);
            # close() would only close the current window.
            driver.quit()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

cb1101

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值