java redis 去重队列_WebMagic增量爬取去重 RedisScheduler队列实现

importjava.util.HashMap;importjava.util.Map;importorg.apache.commons.codec.digest.DigestUtils;importorg.apache.http.NameValuePair;importorg.apache.http.message.BasicNameValuePair;importcom.alibaba.fastjson.JSON;importcom.alibaba.fastjson.JSONArray;importcom.alibaba.fastjson.JSONObject;importredis.clients.jedis.Jedis;importredis.clients.jedis.JedisPool;importredis.clients.jedis.JedisPoolConfig;importus.codecraft.webmagic.Request;importus.codecraft.webmagic.Task;importus.codecraft.webmagic.scheduler.component.DuplicateRemover;/*** Redis中存储着所有抓取到的链接 抓取程序首先判断Redis中是否存在,如果存在该链接直接抛弃. //ERROR** @author CainGao*/publicclassRedisSchedulerextendsDuplicateRemovedSchedulerimplementsMonitorableScheduler, DuplicateRemover {privateJedisPool pool;privatestaticfinalString QUEUE_PREFIX ="queue_";privatestaticfinalString SET_PREFIX ="set_";privatestaticfinalString ITEM_PREFIX ="item_";publicRedisScheduler(String host) {this(newJedisPool(newJedisPoolConfig(), host));}publicRedisScheduler(JedisPool pool) {this.pool = pool;setDuplicateRemover(this);}@OverridepublicvoidresetDuplicateCheck(Task task) {Jedis jedis = pool.getResource();try{jedis.del(getSetKey(task));} finally{pool.returnResource(jedis);}}@OverridepublicbooleanisDuplicate(Request request, Task task) {Jedis jedis = pool.getResource();try{booleanisDuplicate = jedis.sismember(getSetKey(task),request.getUrl());if(!isDuplicate) {jedis.sadd(getSetKey(task), request.getUrl());}returnisDuplicate;} finally{pool.returnResource(jedis);}}@OverrideprotectedvoidpushWhenNoDuplicate(Request request, Task task) {Jedis jedis = pool.getResource();try{jedis.rpush(getQueueKey(task), request.getUrl());if(request.getExtras() !=null) {String field = DigestUtils.shaHex(request.getUrl());String value = JSON.toJSONString(request.getExtra("nameValuePair"));jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);}} finally{pool.returnResource(jedis);}}/*** 移动并且返回队列头部一个元素*/@OverridepublicsynchronizedRequest poll(Task task) {Jedis jedis = pool.getResource();try{String url = jedis.lpop(getQueueKey(task));if(url ==null) {returnnull;}String key = ITEM_PREFIX + task.getUUID();String field = DigestUtils.shaHex(url);String text = jedis.hget(key, field);if(text !=null) {JSONArray array = JSON.parseArray(text);NameValuePair[] nameValuePairs = newNameValuePair[array.size()];for(inti =0; i  map = newHashMap();map.put("nameValuePair", nameValuePairs);r.setMethod("post");r.setExtras(map);returnr;}Request request = newRequest(url);returnrequest;} finally{pool.returnResource(jedis);}}protectedString getSetKey(Task task) {returnSET_PREFIX + task.getUUID();}protectedString getQueueKey(Task task) {returnQUEUE_PREFIX + task.getUUID();}@OverridepublicintgetLeftRequestsCount(Task task) {Jedis jedis = pool.getResource();try{Long size = jedis.llen(getQueueKey(task));returnsize.intValue();} finally{pool.returnResource(jedis);}}@OverridepublicintgetTotalRequestsCount(Task task) {Jedis jedis = pool.getResource();try{Long size = jedis.scard(getQueueKey(task));returnsize.intValue();} finally{pool.returnResource(jedis);}}}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值