- 重写RedisScheduler,改变URL存储形式
package com.jyft.reptileframework.confg;
import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
import us.codecraft.webmagic.scheduler.MonitorableScheduler;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;
/**
 * Custom Redis-backed scheduler that also acts as its own duplicate remover.
 *
 * <p>For each task (identified by {@code task.getUUID()}) three Redis keys are used:
 * <ul>
 *   <li>{@code queue_<uuid>} - a list of pending URLs (pushed on the right, popped on the left);</li>
 *   <li>{@code set_<uuid>} - a set of already-seen URLs, each stored wrapped in double quotes;</li>
 *   <li>{@code item_<uuid>} - a hash mapping the SHA-1 hex of a URL to the JSON form of its request.</li>
 * </ul>
 *
 * @Author:wf
 * @Date 2021/6/23 11:03
 **/
public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {

    protected JedisPool pool;

    private static final String QUEUE_PREFIX = "queue_";
    private static final String SET_PREFIX = "set_";
    private static final String ITEM_PREFIX = "item_";

    /**
     * Creates a scheduler with a default-configured pool connected to the given host.
     *
     * @param host Redis host name
     */
    public RedisScheduler(String host) {
        this(new JedisPool(new JedisPoolConfig(), host));
    }

    /**
     * Creates a scheduler on top of an existing pool and registers itself as the duplicate remover.
     *
     * @param pool connection pool used for every Redis operation
     */
    public RedisScheduler(JedisPool pool) {
        this.pool = pool;
        setDuplicateRemover(this);
    }

    /**
     * Deletes the "seen URLs" set of the task so that every URL is treated as new again.
     *
     * @param task task whose duplicate set is cleared
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        // try-with-resources: Jedis.close() hands the connection back to the pool
        // and lets the client decide whether the connection is still usable.
        try (Jedis jedis = pool.getResource()) {
            jedis.del(getSetKey(task));
        }
    }

    /**
     * Records the request URL in the task's set and reports whether it was already there.
     *
     * @param request request to check
     * @param task    owning task
     * @return {@code true} when the URL was already a member of the set
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        try (Jedis jedis = pool.getResource()) {
            // The URL is stored wrapped in double quotes; SADD returns 0 when nothing was added.
            return jedis.sadd(getSetKey(task), "\"" + request.getUrl() + "\"") == 0;
        }
    }

    /**
     * Appends the URL to the task queue and, when the request carries extras,
     * stores the whole request as JSON in the item hash.
     *
     * @param request request to enqueue
     * @param task    owning task
     */
    @Override
    protected void pushWhenNoDuplicate(Request request, Task task) {
        try (Jedis jedis = pool.getResource()) {
            jedis.rpush(getQueueKey(task), request.getUrl());
            if (request.getExtras() != null) {
                String field = DigestUtils.shaHex(request.getUrl());
                String value = JSON.toJSONString(request);
                jedis.hset(getItemKey(task), field, value);
            }
        }
    }

    /**
     * Pops the next URL from the task queue and rebuilds its request.
     *
     * @param task owning task
     * @return the stored request when one exists in the item hash, a plain request
     *         for the URL otherwise, or {@code null} when the queue is empty
     */
    @Override
    public synchronized Request poll(Task task) {
        try (Jedis jedis = pool.getResource()) {
            String url = jedis.lpop(getQueueKey(task));
            if (url == null) {
                return null;
            }
            // Use the String overload so encoding is handled by the client
            // instead of the platform default charset.
            String json = jedis.hget(getItemKey(task), DigestUtils.shaHex(url));
            if (json != null) {
                return JSON.parseObject(json, Request.class);
            }
            return new Request(url);
        }
    }

    /** @return key of the set holding the URLs already seen for the task */
    protected String getSetKey(Task task) {
        return SET_PREFIX + task.getUUID();
    }

    /** @return key of the list holding the pending URLs of the task */
    protected String getQueueKey(Task task) {
        return QUEUE_PREFIX + task.getUUID();
    }

    /** @return key of the hash holding the serialized requests of the task */
    protected String getItemKey(Task task) {
        return ITEM_PREFIX + task.getUUID();
    }

    /**
     * @param task owning task
     * @return current length of the task's pending queue
     */
    @Override
    public int getLeftRequestsCount(Task task) {
        try (Jedis jedis = pool.getResource()) {
            Long size = jedis.llen(getQueueKey(task));
            return size.intValue();
        }
    }

    /**
     * @param task owning task
     * @return number of members in the task's "seen URLs" set
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        try (Jedis jedis = pool.getResource()) {
            Long size = jedis.scard(getSetKey(task));
            return size.intValue();
        }
    }
}
- 编写redis 配置
package com.jyft.reptileframework.confg;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.cache.CacheManager;
import org.springframework.cache.annotation.CachingConfigurerSupport;
import org.springframework.cache.annotation.EnableCaching;
import org.springframework.cache.interceptor.KeyGenerator;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.redis.cache.RedisCacheManager;
import org.springframework.data.redis.connection.RedisConnectionFactory;
import org.springframework.data.redis.connection.lettuce.LettuceConnectionFactory;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.data.redis.serializer.Jackson2JsonRedisSerializer;
import org.springframework.data.redis.serializer.RedisSerializer;
import org.springframework.data.redis.serializer.StringRedisSerializer;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import java.lang.reflect.Method;
/**
* @Author:wf
* @Date 2021/6/23 9:48
* @Describe:Redis配置类
**/
@Configuration
@EnableCaching
public class RedisConfig extends CachingConfigurerSupport {
/**
* 缓择redis作为默认缓存工具
*/
@Bean
public CacheManager cacheManager(RedisConnectionFactory connectionFactory) {
RedisCacheManager cacheManager = RedisCacheManager.create(connectionFactory);
return cacheManager;
}
@Bean
public RedisTemplate<String, Object> redisTemplate(LettuceConnectionFactory lettuceConnectionFactory) {
// 设置序列化
Jackson2JsonRedisSerializer<Object> jackson2JsonRedisSerializer = new Jackson2JsonRedisSerializer<Object>(
Object.class);
ObjectMapper om = new ObjectMapper();
om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);
om.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
jackson2JsonRedisSerializer.setObjectMapper(om);
// 配置redisTemplate
RedisTemplate<String, Object> redisTemplate = new RedisTemplate<String, Object>();
redisTemplate.setConnectionFactory(lettuceConnectionFactory);
RedisSerializer<?> stringSerializer = new StringRedisSerializer();
redisTemplate.setKeySerializer(stringSerializer);// key序列化
redisTemplate.setValueSerializer(jackson2JsonRedisSerializer);// value序列化
redisTemplate.setHashKeySerializer(stringSerializer);// Hash key序列化
redisTemplate.setHashValueSerializer(jackson2JsonRedisSerializer);// Hash value序列化
redisTemplate.afterPropertiesSet();
return redisTemplate;
}
@Bean
public JedisPool redisPoolFactory() {
JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
//最大空闲连接数, 默认8个
jedisPoolConfig.setMaxIdle(JedisConfigProperty.MAX_IDLE);
//最大连接数, 默认8个
jedisPoolConfig.setMaxTotal(JedisConfigProperty.MAX_ACTIVE);
//获取连接时的最大等待毫秒数(如果设置为阻塞时BlockWhenExhausted),如果超时就抛异常, 小于零:阻塞不确定的时间, 默认-1
jedisPoolConfig.setMaxWaitMillis(JedisConfigProperty.MAX_WAIT_MILLIS);
return new JedisPool(
jedisPoolConfig,
JedisConfigProperty.HOST,
JedisConfigProperty.PORT,
JedisConfigProperty.TIMEOUT,
JedisConfigProperty.PASSWORD,
JedisConfigProperty.DATABASE);
}
}
- 编写配置文件读取类
package com.jyft.reptileframework.confg;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
/**
 * Reads the {@code spring.redis.*} settings and republishes them as public static
 * fields once the bean's properties have been injected.
 */
@Component
public class JedisConfigProperty implements InitializingBean {

    /** Database index to use. */
    @Value("${spring.redis.database}")
    private Integer database;

    /** Server address. */
    @Value("${spring.redis.host}")
    private String host;

    /** Server port. */
    @Value("${spring.redis.port}")
    private Integer port;

    /** Timeout. */
    @Value("${spring.redis.timeout}")
    private Integer timeout;

    /** Password. */
    @Value("${spring.redis.password}")
    private String password;

    /** Maximum number of pooled connections (a negative value means no limit). */
    @Value("${spring.redis.jedis.pool.max-active}")
    private int maxActive;

    /** Maximum number of idle connections in the pool. */
    @Value("${spring.redis.jedis.pool.max-idle}")
    private int maxIdle;

    /** Minimum number of idle connections in the pool. */
    @Value("${spring.redis.jedis.pool.min-idle}")
    private int minIdle;

    /** Maximum blocking wait time of the pool (a negative value means no limit). */
    @Value("${spring.redis.jedis.pool.max-wait}")
    private long maxWaitMillis;

    /*
     * Public static copies of the values above, assigned in afterPropertiesSet().
     */
    public static Integer DATABASE;
    public static String HOST;
    public static Integer PORT;
    public static String PASSWORD;
    public static Integer MAX_ACTIVE;
    public static Integer MAX_IDLE;
    public static Integer MIN_IDLE;
    public static Long MAX_WAIT_MILLIS;
    public static Integer TIMEOUT;

    /**
     * Copies every injected value into its static counterpart.
     *
     * <p>An empty (or missing) password is published as {@code null} so that no
     * empty string is handed to the Redis client; any other password is copied as is.
     */
    @Override
    public void afterPropertiesSet() throws Exception {
        DATABASE = database;
        HOST = host;
        PORT = port;
        // Previously PASSWORD was only assigned in the empty case, so a real
        // password was never published and stayed null.
        PASSWORD = (password == null || password.isEmpty()) ? null : password;
        MAX_ACTIVE = maxActive;
        MAX_IDLE = maxIdle;
        MIN_IDLE = minIdle;
        MAX_WAIT_MILLIS = maxWaitMillis;
        TIMEOUT = timeout;
    }
}
- 使用自定义RedisScheduler
/**
 * JedisPool bean injected from the RedisConfig class.
 **/
@Autowired
private JedisPool jedisPool;
/**
 * Scheduled crawl, triggered every 5 seconds by the cron expression below.
 * Builds a spider that uses the Redis-backed scheduler, runs it to completion
 * and prints the elapsed time.
 */
@Scheduled(cron = "0/5 * * * * ?")
private void test(){
    final long begin = System.currentTimeMillis();
    System.out.println("---------爬虫启动----------");

    // (placeholder in the original: a proxy server would be configured here)

    // Entry point of the crawl, seeded with the start URL.
    Spider crawler = Spider.create(bjcxjsProcessor);
    crawler.addUrl(BJCXJS_START_URL);
    // De-duplicate through Redis so that repeated runs crawl incrementally.
    crawler.setScheduler(new RedisScheduler(jedisPool));
    // Alternative kept from the original: de-duplicate with a Bloom filter instead.
    // crawler.setScheduler(new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000000)));
    // Crawl with 5 threads.
    crawler.thread(5);
    // Persist results to the database through the custom pipeline.
    crawler.addPipeline(bjcxjsPipeline);
    // Exit once the crawl is complete.
    crawler.setExitWhenComplete(true);

    crawler.run();
    crawler.stop();

    System.out.println("--------爬取结束----------");
    final long elapsed = System.currentTimeMillis() - begin;
    System.out.println("耗时:" + DateUtils.getGapTime(elapsed));
}
注:在使用WebMagic(0.7.3)版本时,由于其自身引入的jedis版本为2.9.0,该版本过低,会出现如下报错场景
解决办法为:在引入webmagic的依赖时,将其自带的jedis依赖排除,再重新引入较高版本的jedis,如下:
<!-- webmagic -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
<exclusions>
<!--此处需要排除自带的jedis,否则会因为版本过低出现报错-->
<exclusion>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
</exclusion>
</exclusions>
</dependency>
<!--jedis-->
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
<version>2.10.2</version>
</dependency>