详细参考
一、信号
-
Django
# models.py
from django.db import models


class User(models.Model):
    """Database model with a single ``title`` text column."""

    # Character field limited to 32 characters.
    title = models.CharField(max_length=32)
from django.shortcuts import render, HttpResponse
from app01 import models


def _create_user(title):
    """Create a ``User`` row with the given title and return the confirmation response."""
    models.User.objects.create(title=title)
    return HttpResponse('创建成功')


def func1(request):
    """Return the confirmation response; the create call is commented out here."""
    # models.User.objects.create(title='老男孩')
    return HttpResponse('创建成功')


def func2(request):
    """Create the '小男孩' user and return the confirmation response."""
    return _create_user('小男孩')


def func3(request):
    """Create the '小少年' user and return the confirmation response."""
    return _create_user('小少年')


def func4(request):
    """Create the '小青年' user and return the confirmation response."""
    return _create_user('小青年')
__init__.py(与settings同级的目录)
from django.db.models import signals


def before_save1(*args, **kwargs):
    """First ``pre_save`` receiver: print the arguments the signal delivered."""
    print('有车来了,我要服务了--》', args, kwargs)


def before_save2(*args, **kwargs):
    """Second ``pre_save`` receiver: print the arguments the signal delivered."""
    print('有车来了,我要服务了--》', args, kwargs)


def after_save1(*args, **kwargs):
    """``post_save`` receiver: print the arguments the signal delivered."""
    print('有车来了,完事了--》', args, kwargs)


def _register_receivers():
    """Connect the receivers above to their signals, in declaration order."""
    for handler in (before_save1, before_save2):
        signals.pre_save.connect(handler)
    signals.post_save.connect(after_save1)


# Registration happens at import time of this package module.
_register_receivers()
-
Flask
# app.py
from flask import Flask, render_template
from flask import signals

app = Flask(__name__)


def x1(arg, **extra):
    """First ``request_started`` receiver.

    :param arg: the sender passed by the signal.
    :param extra: any additional keyword arguments the signal may send;
        accepted so the receiver keeps working if the signal gains arguments.
    """
    print('x1')


def x2(arg, **extra):
    """Second ``request_started`` receiver.

    :param arg: the sender passed by the signal.
    :param extra: any additional keyword arguments the signal may send;
        accepted so the receiver keeps working if the signal gains arguments.
    """
    print('x2')


# Both receivers are subscribed to the same signal, x1 first.
signals.request_started.connect(x1)
signals.request_started.connect(x2)

# @app.before_request
# def bf():
#     print('bbbbb')
#     return render_template("asdfadf")


@app.route('/index')
def func():
    """View for ``/index``: print a marker and return a fixed string."""
    print('视图函数')
    return "asdfasdf"


if __name__ == '__main__':
    # app.__call__
    app.run()
    # with app.app_context():
    #     pass
-
Scrapy
ext.py(与settings同级的目录下)
from scrapy import signals


class MyExtend(object):
    """Scrapy extension that prints a marker when a spider opens and closes."""

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        """Create the extension and connect its handlers to the crawler's signals.

        :param crawler: object exposing a ``signals`` manager with ``connect``.
        :return: the newly created extension instance.
        """
        extension = cls()
        hooks = (
            (extension.x1, signals.spider_opened),
            (extension.x2, signals.spider_closed),
        )
        for handler, event in hooks:
            crawler.signals.connect(handler, signal=event)
        return extension

    def x1(self, spider):
        """Handler connected to ``spider_opened``."""
        print('open')

    def x2(self, spider):
        """Handler connected to ``spider_closed``."""
        print('close')
配置:
# Maps the extension's dotted import path to the integer 666.
EXTENSIONS = {
    'xdb.ext.MyExtend': 666,
}
二、scrapy-redis
基于scrapy-redis的去重规则
-
完全自定义
import redis
from scrapy.utils.request import request_fingerprint

try:
    # Current Scrapy releases expose the base class from ``scrapy.dupefilters``.
    from scrapy.dupefilters import BaseDupeFilter
except ImportError:
    # Legacy module path used by the original snippet.
    from scrapy.dupefilter import BaseDupeFilter


class DupFilter(BaseDupeFilter):
    """Duplicate-request filter that records request fingerprints in a Redis set."""

    def __init__(self, host='140.143.227.206', port=8888, password='beta',
                 key='visited_urls'):
        """Open the Redis connection used to store fingerprints.

        The defaults reproduce the values that were previously hard-coded, so
        ``DupFilter()`` behaves as before; pass explicit values to avoid
        keeping connection details (in particular the password) in code.

        :param host: Redis server host.
        :param port: Redis server port.
        :param password: Redis password.
        :param key: name of the Redis set holding the fingerprints.
        """
        self.key = key
        self.conn = redis.Redis(host=host, port=port, password=password)

    def request_seen(self, request):
        """Check whether the given request has already been visited.

        :param request: the request to check.
        :return: True if it was already visited; False if it was not.
        """
        fid = request_fingerprint(request)
        # A result of 1 means the fingerprint was not in the set before this
        # call, i.e. the request is new.
        added = self.conn.sadd(self.key, fid)
        return added != 1
-
继承scrapy-redis 实现自定制
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults


class RedisDupeFilter(RFPDupeFilter):
    """``RFPDupeFilter`` subclass whose ``from_settings`` uses a fixed key name."""

    @classmethod
    def from_settings(cls, settings):
        """Return an instance built from the given settings.

        The ``timestamp`` placeholder of ``defaults.DUPEFILTER_KEY`` is filled
        with the constant string ``'xiaodongbei'``, so every call produces the
        same key instead of a per-run value.

        Note carried over from the original docstring: when the
        ``scrapy_redis.scheduler.Scheduler`` class is used, this method is not
        used, because the scheduler needs to pass the spider name in the key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RedisDupeFilter
            A filter bound to the Redis server described by ``settings``.
        """
        redis_server = get_redis_from_settings(settings)
        fixed_key = defaults.DUPEFILTER_KEY % {'timestamp': 'xiaodongbei'}
        return cls(
            redis_server,
            key=fixed_key,
            debug=settings.getbool('DUPEFILTER_DEBUG'),
        )
配置:
# ############### scrapy-redis connection ####################
REDIS_HOST = '140.143.227.206'  # host name
REDIS_PORT = 8888  # port
# Redis connection parameters. Default:
# REDIS_PARAMS = {'socket_timeout': 30, 'socket_connect_timeout': 30,
#                 'retry_on_timeout': True, 'encoding': REDIS_ENCODING}
REDIS_PARAMS = {'password': 'beta'}
REDIS_ENCODING = "utf-8"  # Redis encoding; default: 'utf-8'
# Connection URL (takes precedence over the settings above):
# REDIS_URL = 'redis://user:pass@hostname:9001'

# ############### scrapy-redis de-duplication ####################
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'dbd.xxx.RedisDupeFilter'