package us.codecraft.webmagic.scheduler;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.NameValuePair;
import org.apache.http.message.BasicNameValuePair;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/**
 * Scheduler that keeps its state in Redis, using three keys per task (each
 * suffixed with the task UUID):
 * <ul>
 * <li>{@code queue_<uuid>} - a list holding the URLs still waiting to be polled;</li>
 * <li>{@code set_<uuid>} - a set holding every URL that has ever been offered,
 * used for duplicate detection;</li>
 * <li>{@code item_<uuid>} - a hash mapping the SHA hex digest of a URL to the
 * JSON-serialized {@code "nameValuePair"} extra of its request.</li>
 * </ul>
 * A URL that is already present in the set is discarded.
 * <p>
 * Known limitation (marked "ERROR" by the original author): because the seen-URL
 * set is persisted, every URL is scheduled only once until
 * {@link #resetDuplicateCheck(Task)} is called, so pages such as list pages
 * cannot be crawled again.
 *
 * @author CainGao
 */
public class RedisScheduler extends DuplicateRemovedScheduler implements
        MonitorableScheduler, DuplicateRemover {

    private static final String QUEUE_PREFIX = "queue_";
    private static final String SET_PREFIX = "set_";
    private static final String ITEM_PREFIX = "item_";

    /** Name of the request extra that carries the POST parameters. */
    private static final String NAME_VALUE_PAIR_EXTRA = "nameValuePair";

    private final JedisPool pool;

    /**
     * Creates a scheduler backed by a new pool with default configuration.
     *
     * @param host Redis host to connect to
     */
    public RedisScheduler(String host) {
        this(new JedisPool(new JedisPoolConfig(), host));
    }

    /**
     * Creates a scheduler on top of an existing pool and registers this
     * instance as its own duplicate remover.
     *
     * @param pool connection pool used for every Redis operation
     */
    public RedisScheduler(JedisPool pool) {
        this.pool = pool;
        setDuplicateRemover(this);
    }

    /**
     * Forgets every URL seen for the task by deleting its seen-URL set.
     * The pending queue and the stored POST parameters are left untouched.
     *
     * @param task task whose duplicate history is cleared
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        Jedis jedis = pool.getResource();
        try {
            jedis.del(getSetKey(task));
        } finally {
            pool.returnResource(jedis);
        }
    }

    /**
     * Records the request URL in the task's seen-URL set and reports whether it
     * was already there.
     *
     * @param request request whose URL is checked
     * @param task    task the request belongs to
     * @return {@code true} if the URL had been seen before, {@code false} if it
     *         was added by this call
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        Jedis jedis = pool.getResource();
        try {
            // SADD returns the number of members actually added, so a single
            // command both tests and inserts; a separate SISMEMBER + SADD pair
            // would let two concurrent callers both see "not a member".
            return jedis.sadd(getSetKey(task), request.getUrl()) == 0;
        } finally {
            pool.returnResource(jedis);
        }
    }

    /**
     * Appends the request URL to the task queue and, when the request carries a
     * {@code "nameValuePair"} extra, stores that extra as JSON under the SHA
     * hex digest of the URL.
     *
     * @param request request to enqueue
     * @param task    task the request belongs to
     */
    @Override
    protected void pushWhenNoDuplicate(Request request, Task task) {
        Jedis jedis = pool.getResource();
        try {
            jedis.rpush(getQueueKey(task), request.getUrl());
            Object nameValuePairs = request.getExtras() == null
                    ? null
                    : request.getExtra(NAME_VALUE_PAIR_EXTRA);
            // Only persist when the extra is really present; serializing a
            // missing extra would store the literal string "null", which
            // poll() cannot turn back into an array.
            if (nameValuePairs != null) {
                String field = DigestUtils.shaHex(request.getUrl());
                jedis.hset(getItemKey(task), field, JSON.toJSONString(nameValuePairs));
            }
        } finally {
            pool.returnResource(jedis);
        }
    }

    /**
     * Removes and returns the request at the head of the task queue.
     * <p>
     * If POST parameters were stored for the URL, the returned request has its
     * method set to {@code "post"} and the parameters restored as a
     * {@code NameValuePair[]} under the {@code "nameValuePair"} extra;
     * otherwise a plain request for the URL is returned.
     *
     * @param task task whose queue is polled
     * @return the next request, or {@code null} if the queue is empty
     */
    @Override
    public synchronized Request poll(Task task) {
        Jedis jedis = pool.getResource();
        try {
            String url = jedis.lpop(getQueueKey(task));
            if (url == null) {
                return null;
            }
            Request request = new Request(url);
            String text = jedis.hget(getItemKey(task), DigestUtils.shaHex(url));
            // parseArray yields null for the JSON literal "null" (which earlier
            // pushes could have stored); treat that like "no parameters".
            JSONArray array = text == null ? null : JSON.parseArray(text);
            if (array == null) {
                return request;
            }
            NameValuePair[] nameValuePairs = new NameValuePair[array.size()];
            for (int i = 0; i < array.size(); i++) {
                JSONObject json = JSONObject.parseObject(array.getString(i));
                nameValuePairs[i] = new BasicNameValuePair(
                        json.getString("name"), json.getString("value"));
            }
            Map<String, Object> extras = new HashMap<String, Object>();
            extras.put(NAME_VALUE_PAIR_EXTRA, nameValuePairs);
            request.setMethod("post");
            request.setExtras(extras);
            return request;
        } finally {
            pool.returnResource(jedis);
        }
    }

    /**
     * @param task task to build the key for
     * @return Redis key of the task's seen-URL set
     */
    protected String getSetKey(Task task) {
        return SET_PREFIX + task.getUUID();
    }

    /**
     * @param task task to build the key for
     * @return Redis key of the task's pending-URL list
     */
    protected String getQueueKey(Task task) {
        return QUEUE_PREFIX + task.getUUID();
    }

    /**
     * @param task task to build the key for
     * @return Redis key of the hash holding the task's stored POST parameters
     */
    protected String getItemKey(Task task) {
        return ITEM_PREFIX + task.getUUID();
    }

    /**
     * @param task task to inspect
     * @return number of URLs currently waiting in the task queue
     */
    @Override
    public int getLeftRequestsCount(Task task) {
        Jedis jedis = pool.getResource();
        try {
            Long size = jedis.llen(getQueueKey(task));
            return size.intValue();
        } finally {
            pool.returnResource(jedis);
        }
    }

    /**
     * @param task task to inspect
     * @return number of distinct URLs recorded in the task's seen-URL set
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        Jedis jedis = pool.getResource();
        try {
            // The total lives in the seen-URL set; the queue key is a list, so
            // SCARD must not be pointed at it.
            Long size = jedis.scard(getSetKey(task));
            return size.intValue();
        } finally {
            pool.returnResource(jedis);
        }
    }
}
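/*
 * Based on the RedisScheduler written by @黄亿华 (Huang Yihua). This version fixes the
 * forced type-cast bug for POST requests and changes what is stored in Redis: only the
 * NameValuePair data is persisted (a business requirement).
 * The next development step is described in the class header comment: persisting only
 * the crawl queue means list pages can never be crawled again, because each URL is
 * crawled exactly once, which does not meet the business requirements.
 */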