Combining Scrapy with Selenium

The middlewares part (remember to enable this middleware in settings).
All the setup work goes into spider_opened, which is connected to the signal in from_crawler.
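
A minimal sketch of the settings entry, assuming the project module is named yichewang (inferred from the YichewangItem class used in the spider below); adjust the path to your own project:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    # The module path is an assumption based on the project name; 543 is
    # the slot Scrapy's own template uses for project middlewares.
    "yichewang.middlewares.SeleniumDownloadMiddleware": 543,
}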

import logging

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium.common.exceptions import JavascriptException, TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

logger = logging.getLogger(__name__)


class SeleniumDownloadMiddleware:
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this to instantiate the middleware; hook the driver's
        # lifecycle onto the spider's open/close signals here.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_opened(self, spider):
        chrome_options = Options()
        # Skip image loading to speed up page rendering.
        prefs = {"profile.managed_default_content_settings.images": 2}
        chrome_options.add_experimental_option("prefs", prefs)
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # Selenium 4 renamed the keyword argument from chrome_options to options.
        self.driver = Chrome(options=chrome_options)
        self.driver.maximize_window()

    def spider_closed(self, spider):
        # Shut the browser down together with the spider.
        self.driver.quit()

    def process_request(self, request, spider):
        item = request.meta.get('item')
        # Only intercept requests explicitly flagged for Selenium; everything
        # else falls through to Scrapy's default downloader.
        if not (item and item.get('type')):
            return None
        if item.get('new_page'):
            self.driver.get(request.url)
            try:
                # Hide the floating ad so it cannot intercept our clicks.
                js = "document.getElementById('bitAd_floatImage_max').style.display='none';"
                self.driver.execute_script(js)
            except JavascriptException:
                pass  # the ad element is not always present
        loc = (By.CSS_SELECTOR,
               f'div.search-result-list>div[data-id=\'{item["data_id"]}\']'
               f'>p.cx-ck-count.text-hover>i')
        try:
            ele2 = WebDriverWait(self.driver, 6, 0.1).until(EC.presence_of_element_located(loc))
        except TimeoutException:
            # On the first request the driver may not be on the target page
            # yet, so navigate there and retry once.
            self.driver.get(request.url)
            ele2 = WebDriverWait(self.driver, 6, 0.1).until(EC.presence_of_element_located(loc))
        self.driver.execute_script("arguments[0].click();", ele2)
        type_loc = (By.CSS_SELECTOR, 'span.ck-cx-list-wrapper>div.ck-cx-list-content>a>div>div')
        # Reading .text forces a wait until the revealed panel has rendered.
        WebDriverWait(self.driver, 10, 0.1).until(EC.presence_of_element_located(type_loc)).text
        html = self.driver.page_source
        self.driver.execute_script("arguments[0].click();", ele2)  # restore the page state
        return HtmlResponse(request.url, body=html, encoding='utf8', request=request)
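
The middleware reads data_id, type, and new_page off the item carried in request.meta. For reference, here is a hypothetical sketch of what YichewangItem (used in the spider below) might declare; the field names come from the code in this post, everything else is an assumption:

# items.py (sketch; only the field names are taken from the post)
import scrapy

class YichewangItem(scrapy.Item):
    sort = scrapy.Field()        # order of the letter group
    letter = scrapy.Field()      # index letter of the brand group
    brand_name = scrapy.Field()
    logo = scrapy.Field()
    data_id = scrapy.Field()     # id the middleware uses to locate the element
    type = scrapy.Field()        # True -> route the request through Selenium
    new_page = scrapy.Field()    # True -> the middleware calls driver.get() first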

The spider part
Before a request reaches its next callback, it passes through the middleware above and process_request runs; when the Selenium work finishes, the middleware returns an HtmlResponse directly to the spider. If the spider has several callbacks, the Selenium middleware must check which requests it should actually handle (here via the item's type and new_page flags carried in meta), otherwise every request for every callback would be routed through Selenium. A hedged sketch of such a follow-up callback appears after the spider code below.

import copy

import scrapy

# Assumption: the item class lives in the project's items module.
from ..items import YichewangItem


class CarSpider(scrapy.Spider):
    name = 'car'
    allowed_domains = ['xxxxxxx']
    start_urls = ['xxxxxxx']

    def parse(self, response):
        item = YichewangItem()
        alpha_list = response.css('div.brand-list>div')
        sort = 1
        for alpha in alpha_list:
            item['sort'] = sort
            item['letter'] = alpha.css('div.item-letter::text').extract_first()
            for brand in alpha.css('div.item-brand'):
                item['brand_name'] = brand.css('a>div::text').extract_first()
                logo = brand.css('img::attr(data-original)').extract_first()
                item['logo'] = 'https://' + logo
                brand_id = brand.css('a>div::attr(data-id)').extract_first()
                item['type'] = False  # plain request: skip the Selenium middleware
                url = 'https://car.yiche.com/xuanchegongju/?mid=' + str(brand_id)
                # deepcopy so each request carries its own snapshot of the item
                yield scrapy.Request(url, callback=self.pages,
                                     meta={'item': copy.deepcopy(item)}, dont_filter=True)
            sort += 1
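
For completeness, a hypothetical sketch of the pages callback the requests above point at: it flips the meta flags that the Selenium middleware checks, so only these follow-up requests take the Selenium path. The selectors and the detail callback name are assumptions for illustration, not from the source:

    def pages(self, response):
        # Hypothetical follow-up callback (not in the original post).
        item = response.meta['item']
        for car in response.css('div.search-result-list>div[data-id]'):
            item['data_id'] = car.attrib['data-id']  # used by the middleware's locator
            item['type'] = True      # gate: the middleware only intercepts truthy 'type'
            item['new_page'] = True  # gate: the middleware calls driver.get() first
            # Re-request the same URL; Selenium renders the in-page click result.
            yield scrapy.Request(response.url, callback=self.detail,
                                 meta={'item': copy.deepcopy(item)}, dont_filter=True)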