- import java.util.HashMap;
- import java.util.Map;
- import org.apache.commons.codec.digest.DigestUtils;
- import org.apache.http.NameValuePair;
- import org.apache.http.message.BasicNameValuePair;
- import com.alibaba.fastjson.JSON;
- import com.alibaba.fastjson.JSONArray;
- import com.alibaba.fastjson.JSONObject;
- import redis.clients.jedis.Jedis;
- import redis.clients.jedis.JedisPool;
- import redis.clients.jedis.JedisPoolConfig;
- import us.codecraft.webmagic.Request;
- import us.codecraft.webmagic.Task;
- import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
- /**
- * Redis中存储着所有抓取到的链接 抓取程序首先判断Redis中是否存在,如果存在该链接直接抛弃. //ERROR
- *
- * @author CainGao
- */
- public class RedisScheduler extends DuplicateRemovedScheduler implements
- MonitorableScheduler, DuplicateRemover {
- private JedisPool pool;
- private static final String QUEUE_PREFIX = "queue_";
- private static final String SET_PREFIX = "set_";
- private static final String ITEM_PREFIX = "item_";
- public RedisScheduler(String host) {
- this(new JedisPool(new JedisPoolConfig(), host));
- }
- public RedisScheduler(JedisPool pool) {
- this.pool = pool;
- setDuplicateRemover(this);
- }
- @Override
- public void resetDuplicateCheck(Task task) {
- Jedis jedis = pool.getResource();
- try {
- jedis.del(getSetKey(task));
- } finally {
- pool.returnResource(jedis);
- }
- }
- @Override
- public boolean isDuplicate(Request request, Task task) {
- Jedis jedis = pool.getResource();
- try {
- boolean isDuplicate = jedis.sismember(getSetKey(task),
- request.getUrl());
- if (!isDuplicate) {
- jedis.sadd(getSetKey(task), request.getUrl());
- }
- return isDuplicate;
- } finally {
- pool.returnResource(jedis);
- }
- }
- @Override
- protected void pushWhenNoDuplicate(Request request, Task task) {
- Jedis jedis = pool.getResource();
- try {
- jedis.rpush(getQueueKey(task), request.getUrl());
- if (request.getExtras() != null) {
- String field = DigestUtils.shaHex(request.getUrl());
- String value = JSON.toJSONString(request
- .getExtra("nameValuePair"));
- jedis.hset((ITEM_PREFIX + task.getUUID()), field, value);
- }
- } finally {
- pool.returnResource(jedis);
- }
- }
- /**
- * 移动并且返回队列头部一个元素
- */
- @Override
- public synchronized Request poll(Task task) {
- Jedis jedis = pool.getResource();
- try {
- String url = jedis.lpop(getQueueKey(task));
- if (url == null) {
- return null;
- }
- String key = ITEM_PREFIX + task.getUUID();
- String field = DigestUtils.shaHex(url);
- String text = jedis.hget(key, field);
- if (text != null) {
- JSONArray array = JSON.parseArray(text);
- NameValuePair[] nameValuePairs = new NameValuePair[array.size()];
- for (int i = 0; i < array.size(); i++) {
- JSONObject json = JSONObject
- .parseObject(array.getString(i));
- nameValuePairs[i] = new BasicNameValuePair(
- json.getString("name"), json.getString("value"));
- }
- Request r = new Request(url);
- Map<String, Object> map = new HashMap<String, Object>();
- map.put("nameValuePair", nameValuePairs);
- r.setMethod("post");
- r.setExtras(map);
- return r;
- }
- Request request = new Request(url);
- return request;
- } finally {
- pool.returnResource(jedis);
- }
- }
- protected String getSetKey(Task task) {
- return SET_PREFIX + task.getUUID();
- }
- protected String getQueueKey(Task task) {
- return QUEUE_PREFIX + task.getUUID();
- }
- @Override
- public int getLeftRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
- Long size = jedis.llen(getQueueKey(task));
- return size.intValue();
- } finally {
- pool.returnResource(jedis);
- }
- }
- @Override
- public int getTotalRequestsCount(Task task) {
- Jedis jedis = pool.getResource();
- try {
- Long size = jedis.scard(getQueueKey(task));
- return size.intValue();
- } finally {
- pool.returnResource(jedis);
- }
- }
- }
WebMagic增量爬取去重 RedisScheduler队列实现
最新推荐文章于 2024-02-23 10:09:38 发布