文章目录
基于数据指纹的增量式
概念
- 检测网站数据更新的内容
- 核心:去重
- url
- 数据指纹
增量式爬虫:电影名称与电影类型的爬取
url:https://www.4567tv.co/list/index1.html
新建文件夹:
4567tv
在资源管理器的地址栏中输入 cmd 并回车
在cmd中写入命令
定位
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
class DySpider(CrawlSpider):
    """Incremental movie-list spider that dedupes detail-page URLs via Redis.

    Each detail URL is SADD-ed into the Redis set ``mv_link``; only URLs not
    already in the set are fetched, so re-running the spider crawls only
    newly published pages.
    """
    # Shared Redis connection used as the de-duplication store.
    conn = Redis('127.0.0.1', 6379)
    name = 'dy'
    # BUG FIX: 'www.baidu.com' made the offsite middleware drop every
    # 4567tv.co request; leave allowed_domains unset (as AvSpider below does).
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.4567tv.co/list/index1.html']
    # Follow the paginated list pages: /list/index1-2.html, -3.html, ...
    link = LinkExtractor(allow=r'/list/index1-\d+\.html')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract movie detail links from a list page; request only unseen ones."""
        li_list = response.xpath('//div[contains(@class,"index-area")]/ul/li')
        for li in li_list:
            my_link = 'https://www.4567tv.co' + li.xpath('./a/@href').extract_first()
            # SADD returns 1 when the member is new, 0 when it already exists.
            ret = self.conn.sadd('mv_link', my_link)
            if ret:
                # BUG FIX: the Request was built but never yielded, so Scrapy
                # discarded it and the detail page was never fetched.
                yield scrapy.Request(url=my_link)
            else:
                print('没有数据更新,无需爬取')
左边窗口运行 redis 服务端,右边窗口用 redis 客户端查看集合中的数据
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zls.items import ZlsItem
class AvSpider(CrawlSpider):
    """Incremental spider: crawl paginated movie lists, fetch only unseen
    detail pages (deduped through the Redis set ``mv_link``), and yield the
    movie title as a ZlsItem.
    """
    # Redis connection acting as the persistent de-duplication store.
    conn = Redis('127.0.0.1', 6379)
    name = 'av'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.4567tv.co/list/index1.html']
    # Matches paginated list URLs such as /list/index1-2.html.
    link = LinkExtractor(allow=r'/list/index1-\d+\.html')
    rules = (
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """For each movie on a list page, request its detail page unless the
        URL was already recorded in Redis."""
        print(11111111111111111111111111111111)
        print(response)
        entries = response.xpath('//div[contains(@class,"index-area")]/ul/li')
        for entry in entries:
            detail_url = 'https://www.4567tv.co' + entry.xpath('./a/@href').extract_first()
            # SADD -> 1 when the URL is new, 0 when it is already stored.
            is_new = self.conn.sadd('mv_link', detail_url)
            if not is_new:
                print('没有数据更新, 无需爬取!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                continue
            print('有数据更新......................................')
            yield scrapy.Request(url=detail_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Scrape the movie title from a detail page and yield it as an item."""
        item = ZlsItem()
        item['title'] = response.xpath('//h1[@class="title"]/text()').extract_first()
        print(item)
        yield item