# 一、根据url判断 (Part 1: skip already-crawled pages by checking the URL)
import scrapy
from redis import Redis
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from increment1.items import Increment1Item
"""
爬取过的数据跳过
1、通过url判断
2、通过数据指纹判断
"""
class FirstSpider(CrawlSpider):
name = 'first'
start_urls = ['https://www.4567tv.tv/index.php/vod/show/id/7.html']
rules = (
Rule(LinkExtractor(allow=r'/index.php/vod/show/id/7/page/\d+\.html'), callback='parse_item', follow=True),
)
def parse_detail(self, response):
item = response['item']
actor = response.xpath('//div[@class="stui-content__detail"]/p[3]//text()'