# Custom duplicate filter (dedupe by URL via Redis):
import hashlib
from redis import StrictRedis
from scrapy.dupefilters import RFPDupeFilter
import os
import redis
from w3lib.url import canonicalize_url
class URLRedisFilter(RFPDupeFilter):
    """Dupefilter that also deduplicates on the raw request URL via Redis.

    A URL already recorded in Redis (checked through ``UrlFilterAndAdd``)
    is treated as seen across crawler runs. The standard in-memory/file
    fingerprint dedup inherited from ``RFPDupeFilter`` is kept unchanged
    so intermediate pages still dedupe within a single run — dropping it
    can send the spider into an infinite loop.
    """

    def __init__(self, path=None, debug=False):
        # Fix: forward ``debug`` to the parent instead of silently
        # dropping it (the original called RFPDupeFilter.__init__(self, path)).
        RFPDupeFilter.__init__(self, path, debug)
        # Redis-backed URL store/checker (defined later in this file).
        self.dupefilter = UrlFilterAndAdd()

    def request_seen(self, request):
        """Return True if *request* was already seen, else False.

        First consults the persistent Redis URL set, then falls back to
        Scrapy's normal fingerprint-based check.
        """
        # Persistent check: URL already stored in Redis -> duplicate.
        if self.dupefilter.check_url(request.url):
            return True
        # Standard Scrapy fingerprint dedup, kept as-is.
        fp = self.request_fingerprint(request)
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        if self.file:
            self.file.write(fp + os.linesep)
        # Fix: explicit False instead of an implicit None fall-through.
        return False
class UrlFilterAndAdd(object):
def __init__(self):
redis_config = {
"host": "127.0.0.1", # redis ip
"port": 6379,