importjava.util.HashMap;importjava.util.Map;importorg.apache.commons.codec.digest.DigestUtils;importorg.apache.http.NameValuePair;importorg.apache.http.message.BasicNameValuePair;importcom.alibaba.fastjson.JSON;importcom.alibaba.fastjson.JSONArray;importcom.alibaba.fastjson.JSONObject;importredis.clients.jedis.Jedis;importredis.clients.jedis.JedisPool;importredis.clients.jedis.JedisPoolConfig;importus.codecraft.webmagic.Request;importus.codecraft.webmagic.Task;importus.codecraft.webmagic.scheduler.component.DuplicateRemover;/*** Redis中存储着所有抓取到的链接 抓取程序首先判断Redis中是否存在,如果存在该链接直接抛弃. //ERROR** @author CainGao*/publicclassRedisSchedulerextendsDuplicateRemovedSchedulerimplementsMonitorableScheduler, DuplicateRemover {privateJedisPool pool;privatestaticfinalString QUEUE_PREFIX ="queue_";privatestaticfinalString SET_PREFIX ="set_";privatestaticfinalString ITEM_PREFIX ="item_";publicRedisScheduler(String host) {this(newJedisPool(newJedisPoolConfig(), host));}publicRedisScheduler(JedisPool pool) {this.pool = pool;setDuplicateRemover(this);}@OverridepublicvoidresetDuplicateCheck(Task task) {Jedis jedis = pool.getResource();try{jedis.del(getSetKey(task));} finally{pool.returnResource(jedis);}}@OverridepublicbooleanisDuplicate(Request request, Task task) {Jedis jedis = pool.getResource();try{booleanisDuplicate = jedis.sismember(getSetKey(task),request.getUrl());if(!isDuplicate) {jedis.sadd(getSetKey(task), request.getUrl());}returnisDuplicate;} finally{pool.returnResource(jedis);}}@OverrideprotectedvoidpushWhenNoDuplicate(Request request, Task task) {Jedis jedis = pool.getResource();try{jedis.rpush(getQueueKey(task), request.getUrl());if(request.getExtras() !=null) {String field = DigestUtils.shaHex(request.getUrl());String value = JSON.toJSONString(request.getExtra("nameValuePair"));jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);}} finally{pool.returnResource(jedis);}}/*** 移动并且返回队列头部一个元素*/@OverridepublicsynchronizedRequest poll(Task task) {Jedis jedis = 
pool.getResource();try{String url = jedis.lpop(getQueueKey(task));if(url ==null) {returnnull;}String key = ITEM_PREFIX + task.getUUID();String field = DigestUtils.shaHex(url);String text = jedis.hget(key, field);if(text !=null) {JSONArray array = JSON.parseArray(text);NameValuePair[] nameValuePairs = newNameValuePair[array.size()];for(inti =0; i map = newHashMap();map.put("nameValuePair", nameValuePairs);r.setMethod("post");r.setExtras(map);returnr;}Request request = newRequest(url);returnrequest;} finally{pool.returnResource(jedis);}}protectedString getSetKey(Task task) {returnSET_PREFIX + task.getUUID();}protectedString getQueueKey(Task task) {returnQUEUE_PREFIX + task.getUUID();}@OverridepublicintgetLeftRequestsCount(Task task) {Jedis jedis = pool.getResource();try{Long size = jedis.llen(getQueueKey(task));returnsize.intValue();} finally{pool.returnResource(jedis);}}@OverridepublicintgetTotalRequestsCount(Task task) {Jedis jedis = pool.getResource();try{Long size = jedis.scard(getQueueKey(task));returnsize.intValue();} finally{pool.returnResource(jedis);}}}
Java Redis 去重队列:WebMagic 增量爬取去重的 RedisScheduler 队列实现
最新推荐文章于 2023-07-26 11:23:56 发布