Scenario 1: when the project contains multiple spiders, initialize the driver object inside each spider
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@author: carry
@time: 2019/6/19 15:56
@desc:
'''
import scrapy
from selenium import webdriver
from scrapy import signals
from pydispatch import dispatcher  # scrapy.xlib.pydispatch was removed in Scrapy 2.0; import pydispatch directly
from scrapy.http import HtmlResponse
from selenium.webdriver.chrome.options import Options
class ZhihuSpider(scrapy.Spider):
    '''
    When the project contains multiple spiders, initialize the driver object inside each spider.
    '''
    name = 'zhihu'
    allowed_domains = ['oschina.net']  # domains only, not URLs
    start_urls = ['https://www.oschina.net/blog']

    def __init__(self):
        # Initialize the headless Chrome driver
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        self.browser = webdriver.Chrome(r'D:\chromedriver_win32\chromedriver.exe',
                                        options=chrome_options)
        super(ZhihuSpider, self).__init__()
        # When the spider closes, Scrapy emits a spider_closed signal;
        # connect it to closeSpider so the browser shuts down with the spider.
        dispatcher.connect(receiver=self.closeSpider, signal=signals.spider_closed)

    def closeSpider(self, spider):
        print("spider closed")
        self.browser.quit()

    def parse(self, response):
        pass
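The parse callback is left empty above. Once the middleware below returns the rendered page, parse receives an ordinary HtmlResponse, so standard Scrapy selectors work on JavaScript-rendered content. A minimal sketch, assuming a hypothetical CSS selector (adapt it to the real page markup):

    def parse(self, response):
        # response.text is the browser's page_source handed back by the middleware,
        # so nodes rendered by JavaScript are already present.
        for title in response.css('.blog-item a.title::text').extract():  # hypothetical selector
            yield {'title': title}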
class SeleniumMiddleware(object):
    '''
    Downloader middleware that fetches pages with the spider's Selenium browser.
    '''

    def process_request(self, request, spider):
        if spider.name == "zhihu":
            spider.browser.get(request.url)
            return HtmlResponse(url=spider.browser.current_url, body=spider.browser.page_source,
                                encoding="utf-8", request=request)
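For the middleware to take effect it must be enabled in settings.py. A minimal sketch; 'myproject.middlewares' is a placeholder for the actual module path of the class above:

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.SeleniumMiddleware': 543,  # placeholder path; 543 is the usual example priority
}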
Scenario 2: Selenium is needed to type a search query; the result is a list page, and each item must be clicked through to reach its detail page
# These helpers live in the middleware class and need Selenium's wait utilities:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def use_selenium_step01(self, request, spider):
    '''Type the search keyword and load the search result list page.'''
    driver = spider.browser
    driver.get(request.url)
    try:
        search_input = driver.find_element_by_xpath('//textarea')
        search_input.send_keys(request.meta['keyword'])
        btn_submit = driver.find_element_by_xpath('//span[contains(text(),"Find")]/..')
        btn_submit.click()
        # Wait (up to 5s, polling every 0.5s) until the result table is present
        WebDriverWait(driver, 5, 0.5).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="ui-datatable-tablewrapper"]/table/tbody')))
    except Exception as e:
        print(e)
    return

def use_selenium_step02(self, request, spider):
    '''
    Click through to one detail link on the list page.
    parser_01 yields the row index (request.meta['i']) that gives each
    link a unique XPath.
    '''
    driver = spider.browser
    driver.get(request.url)
    try:
        # Repeat the search logic from step01
        search_input = driver.find_element_by_xpath('//textarea')
        search_input.send_keys(request.meta['keyword'])
        btn_submit = driver.find_element_by_xpath('//span[contains(text(),"Find")]/..')
        btn_submit.click()
        WebDriverWait(driver, 5, 0.5).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="ui-datatable-tablewrapper"]/table/tbody')))
        # Then click the target detail link in row i
        btn_detail = driver.find_element_by_xpath(
            '//div[@class="ui-datatable-tablewrapper"]/table/tbody/tr[{}]//a'.format(request.meta['i']))
        btn_detail.click()
        WebDriverWait(driver, 5, 0.5).until(
            EC.presence_of_element_located((By.XPATH, '//span[contains(text(),"Back")]')))
    except Exception as e:
        print(e)
    return
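The post does not show how these two helpers are invoked; a plausible wiring inside the middleware's process_request (reusing the HtmlResponse import from scenario 1), assuming list requests carry meta['keyword'] and detail requests additionally carry the row index meta['i']:

def process_request(self, request, spider):
    if 'keyword' not in request.meta:
        return None  # not a search request: let Scrapy's downloader handle it
    if 'i' in request.meta:
        self.use_selenium_step02(request, spider)  # search, then click detail row i
    else:
        self.use_selenium_step01(request, spider)  # search only, stay on the list page
    # Hand the rendered page back to Scrapy as a normal response
    return HtmlResponse(url=spider.browser.current_url,
                        body=spider.browser.page_source,
                        encoding='utf-8', request=request)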