1、Pipleline 加入如下代码:(在数据爬完后将URL塞入redis去重)
class RedisInsert(object):
def process_item(self,item,spider):
set_redis_values_1(item['url'])
return item
2、Middleware加入如下代码:(在爬数据之前查看该URL是否爬取过)
class IngoreRequestMiddleware(object):
def __init__(self):
self.middlewareLogging=getLogger("IngoreRequestMiddleware")
def process_request(self,request,spider):
if get_redis_values_1(request.url):
self.middlewareLogging.debug("IgnoreRequest : %s" % request.url)
raise IgnoreRequest("IgnoreRequest : %s" % request.url)
else:
self.middlewareLogging.debug("haveRequest : %s" % request.url)
return None
ok!!!