如果我们要爬取一个网站,而网站的数据又是经常更新的,那么每次启动爬虫时,它都会把爬取到的数据当作新数据,无法区分哪些数据之前已经爬取过。因此,我们需要一个凭证(例如已爬取 URL 的记录)来告诉爬虫这条数据已经存在。
movie.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zls_movie.items import ZlsMovieItem
class MovieSpider(CrawlSpider):
# Name of this spider.
name = 'movie'
allowed_domains = ['4567tv.tv']
# The crawl starts from page 1 of the listing.
start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/1/page/1.html']
# Matches the paginated listing URLs (.../show/id/1/page/<number>.html).
link = LinkExtractor(allow=r'.*show/id/1/page/\d+\.html')
# Responses for links matched by `link` are handled by parse_item; the rule
# is created with follow=True.
rules = (
Rule(link, callback='parse_item', follow=True),
)
# Prefix that parse_item prepends to an extracted href to build the detail URL.
domain = "https://www.4567tv.tv"
# Redis client for 127.0.0.1:6379; parse_item adds each link to the
# "movie_url" set on this connection to tell new links from known ones.
conn = Redis(host="127.0.0.1", port=6379)
def parse_item(self, response):
    """Request the detail page of every movie on a listing page not seen before.

    Each movie anchor on the page is checked against the Redis set
    ``movie_url``. The result of ``sadd`` is used as the "is this link new?"
    test: a truthy result means the link was not yet recorded.

    Args:
        response: the listing-page response this callback receives.

    Yields:
        scrapy.Request: one request per previously unseen link, with
        ``self.parse_detail`` as callback and a ``ZlsMovieItem`` holding the
        title under ``meta["item"]``.
    """
    # Select the anchors themselves so the href and the title are always read
    # from the same element, and so every movie on the page is handled rather
    # than only the first one.
    anchors = response.xpath("//div[@class='stui-vodlist__box']"
                             "//div[@class='stui-vodlist__detail']//h4//a")
    for anchor in anchors:
        link = anchor.xpath("./@href").extract_first()
        if not link:
            # Without an href there is nothing to record or request, and
            # `self.domain + None` would raise TypeError.
            continue
        ex = self.conn.sadd("movie_url", link)
        if ex:
            print("数据是新鲜的,请爬取")
            item = ZlsMovieItem()
            item['title'] = anchor.xpath("./text()").extract_first()
            yield scrapy.Request(self.domain + link, callback=self.parse_detail, meta={"item": item})
        else:
            print("数据已经存在")
def