Scrapy + Selenium: Simulating Login and Scraping Dynamically Loaded Page Data

This article shows how to simulate a login to the 58.com job board and then scrape the viewer count that is dynamically loaded on a single job-listing page.

Straight to the code!

1. The Scrapy spider

import scrapy


class TestSpider(scrapy.Spider):
    name = 'test'
    # Allowed domains. Use the bare domain so that sz.58.com, cs.58.com and
    # passport.58.com all pass Scrapy's offsite filter.
    allowed_domains = ['58.com']
    # URL of the single job listing we want data from
    start_urls = ['https://sz.58.com/zpyiyuanyiliao/45641604514186x.shtml']

    # Preliminary request: visit the login page so the middleware can store cookies
    def start_requests(self):
        log_url = "https://passport.58.com/login/?path=https%3A%2F%2Fcs.58.com%2Fjob.shtml%3Futm_source%3Dmarket%26spm%3Du-2d2yxv86y3v43nkddh1.BDPCPZ_BT%26PGTID%3D0d100000-0019-e329-e4d5-4bb0d0c6e79f%26ClickID%3D4&source=58-homepage-pc&PGTID=0d202408-0019-ed52-7dcd-8033768af605&ClickID=2"
        yield scrapy.Request(url=log_url)

    # Once the login request has come back, request the actual listing page
    def parse(self, response):
        print("Requesting the listing page")
        yield scrapy.Request(url=self.start_urls[0], callback=self.test, dont_filter=True)

    # Parse the listing page and print the viewer count as a quick check
    def test(self, response):
        print(response.xpath("//span[@class='pos_base_num pos_base_browser']/i/text()").extract_first())
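
Once the middleware below is wired up in settings (see the note after step 2), the spider is started the usual way from the project root:

scrapy crawl test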

2. The downloader middleware

Only the process_request method is modified; everything else is Scrapy's default middleware template.

import json
import time

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


# Downloader middleware
class DemoDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Headless mode; put every flag and experimental option on one Options object
        opt = Options()
        opt.add_argument("--headless")
        opt.add_argument("--disable-gpu")
        opt.add_argument("log-level=3")  # quiet Chrome's console logging
        # opt.add_argument("--proxy-server=162.14.98.117:16817")  # optional proxy
        # Evade basic automation detection
        opt.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])

        driver = webdriver.Chrome(service=Service("F:/pycharm/test/3.动态加载数据处理/chromedriver.exe"),
                                  options=opt)

        # Have Selenium carry the cookies.
        # If this is the login request, log in and save the cookies.
        if "login" in request.url:
            driver.get(request.url)
            time.sleep(5)
            # Enter the account and password via Selenium. If the site only offers
            # QR-code login, lengthen the sleep and scan the code by hand instead.
            driver.find_element(By.ID, "mask_body_item_username").send_keys("xxxxx")
            driver.find_element(By.ID, "mask_body_item_newpassword").send_keys("xxxx")
            driver.find_element(By.ID, "mask_body_item_login").click()
            time.sleep(2)
            # Persist the cookies
            cookies = driver.get_cookies()
            jsonData = json.dumps(cookies)
            with open("./cookie.json", 'w') as fp:
                fp.write(jsonData)
            print("Cookies saved")
            driver.quit()
            # Returning None hands the login request back to Scrapy's downloader,
            # so parse() still fires and can issue the real request
            return None

        # Normal request: load the stored cookies
        with open("./cookie.json", "r") as fp2:
            list_cookies = json.loads(fp2.read())

        driver.get(request.url)  # Pre-request -- essential; add_cookie fails without it (see Problem 2)
        # For the available cookie fields, inspect the saved JSON file
        for cookie in list_cookies:
            cookie_dict = {
                "domain": ".58.com",
                # "expiry": int(cookie.get('expiry')),  # dropped -- see Problem 1
                "httpOnly": False,
                "name": cookie.get('name'),
                "path": "/",
                "secure": False,
                "value": cookie.get('value')
            }
            driver.add_cookie(cookie_dict)
        driver.get(request.url)  # The real request, now carrying the cookies
        driver.refresh()  # Refresh the page -- important!
        time.sleep(2)     # Wait for the dynamic content to load
        data = driver.page_source  # Grab the rendered page
        driver.quit()     # quit() rather than close(), so the chromedriver process also exits
        # Wrap the rendered HTML in a response object and short-circuit the download
        response = HtmlResponse(url=request.url, body=data, encoding='utf-8', request=request)
        return response

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
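
One caveat: as written, process_request launches and tears down a fresh Chrome for every request, which is slow. A minimal sketch of a common refinement (keeping the imports and the rest of the middleware above unchanged) is to open one driver per crawl via signals:

class DemoDownloaderMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_opened(self, spider):
        opt = Options()
        opt.add_argument("--headless")
        self.driver = webdriver.Chrome(options=opt)  # one browser for the whole crawl

    def spider_closed(self, spider):
        self.driver.quit()  # shut the browser down when the crawl ends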

Note: before running, enable cookies in the settings file, set a User-Agent, turn off robots.txt compliance, and enable the downloader middleware!
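
For reference, a minimal sketch of the relevant settings.py entries (the project path demo.middlewares and the User-Agent string are assumptions; use your own):

COOKIES_ENABLED = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'demo.middlewares.DemoDownloaderMiddleware': 543,
}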

Result screenshot: (image not included; the spider prints the listing's viewer count to the console.)

Problem 1: adding a cookie raised selenium.common.exceptions.InvalidArgumentException: Message: invalid argument: invalid 'expiry'

Fix: I simply commented out the expiry field, and it worked. Without expiry the cookie becomes a session cookie, which is fine for a single crawl.
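
If you would rather keep the expiry, one common cause of this error is a non-integer value; a minimal sketch (assuming the cookie.json written above) coerces it before adding the cookie:

for cookie in list_cookies:
    if 'expiry' in cookie:
        cookie['expiry'] = int(cookie['expiry'])  # Selenium expects an integer timestamp
    driver.add_cookie(cookie)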

Problem 2: adding the cookies raised selenium.common.exceptions.InvalidCookieDomainException: Message: invalid cookie domain

Fix: I had not issued the pre-request before calling add_cookie. Selenium only lets you set cookies for the domain of the page currently loaded, so visit the site first (see the code above) and the error goes away.
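
To make the rule concrete, a standalone sketch (throwaway cookie values, chromedriver assumed on PATH):

from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://sz.58.com/")  # 1. Land on the target domain first
driver.add_cookie({"name": "demo", "value": "1", "domain": ".58.com", "path": "/"})  # 2. Now allowed
driver.get("https://sz.58.com/zpyiyuanyiliao/45641604514186x.shtml")  # 3. Re-request with the cookie attached
driver.quit()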
