- 中间件
  - 下载中间件
    - 位置:引擎和下载器之间
    - 作用:批量拦截整个工程中所有的请求和响应
    - 拦截请求:
      - UA伪装:process_request
      - 代理IP:process_exception:return request
    - 拦截响应:
      - 篡改响应数据、响应对象
这是我自己随便写的一个爬虫实例,爬取视频。scrapy框架+selenium,因为要实现页面交互。
功能实现得不太完善,主要是想给出一个中间件的使用方法。
以下代码块按照上图所示的顺序排列。
import scrapy
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import ChromeOptions
from douyinPro.items import DouyinproItem
class DouyinSpider(scrapy.Spider):
    """Spider that collects video source URLs from a Douyin user's page.

    The target pages require JavaScript rendering, so a class-level Chrome
    instance (``bro``) is kept around; the downloader middleware is expected
    to use it to produce rendered responses.
    """
    name = 'douyin'
    # NOTE(review): input() runs at class-definition (import) time, which
    # blocks any non-interactive run; consider passing it via `-a word=...`.
    word = input('请输入目标用户抖音号:')
    # allowed_domains = ['www.douyin.com']
    start_urls = ['https://www.douyin.com/?enter=guide']
    # Accumulates detail-page URLs; class-level, so shared across instances.
    douyin_urls = []

    option = ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--disable-gpu')
    s = Service("chromedriver.exe")
    # bro = webdriver.Chrome(service=s, options=option)  # headless browser
    bro = webdriver.Chrome(service=s)
    # Mask `navigator.webdriver` so the site cannot trivially detect Selenium.
    bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
    })

    def parse_detail(self, response):
        """Extract the video source URL from a detail page and yield an item.

        Yields a DouyinproItem with the absolute ``src`` URL, or nothing when
        the page did not render the expected <video> element.
        """
        src = response.xpath('//*[@id="root"]/div/div[2]/div/div/div[1]/div[1]/div[2]/div/div[1]/div/div[2]/div[2]/xg-video-container/video/source[1]/@src').extract_first()
        # extract_first() returns None when the XPath matches nothing;
        # 'https:' + None would raise TypeError, so skip such pages.
        if src is None:
            return
        item = DouyinproItem()
        item['src'] = 'https:' + src
        yield item

    def parse(self, response):
        """Collect links to each video's detail page and schedule requests."""
        li_list = response.xpath('//*[@id="root"]/div/div[2]/div/div/div[4]/div[1]/div[2]/ul/li')
        for li in li_list:
            cc = li.xpath('./a/@href').extract_first()
            # Guard against <li> entries without an <a href> (None result).
            if cc is None:
                continue
            cc = 'https:' + cc
            self.douyin_urls.append(cc)
        for href in self.douyin_urls:
            yield scrapy.Request(href, callback=self.parse_detail)
import scrapy
class DouyinproItem(scrapy.Item):
    """Item carrying a single scraped video's source URL."""

    # Absolute URL of the video's <source> element.
    src = scrapy.Field()
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random
from selenium.webdriver.common.by import By
from time import sleep
from scrapy.http import HtmlResponse
class DouyinproDownloaderMiddleware:
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "