WebMagic使用Redis去重,实现增量爬取

  1. 重写RedisScheduler,改变URL存储形式
package com.jyft.reptileframework.confg;


import com.alibaba.fastjson.JSON;
import org.apache.commons.codec.digest.DigestUtils;
import redis.clients.jedis.Jedis;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.scheduler.DuplicateRemovedScheduler;
import us.codecraft.webmagic.scheduler.MonitorableScheduler;
import us.codecraft.webmagic.scheduler.component.DuplicateRemover;

/**
 * Custom WebMagic scheduler that keeps its state in Redis.
 *
 * <p>Three Redis keys are used per task, each suffixed with {@code task.getUUID()}:
 * <ul>
 *   <li>{@code queue_<uuid>} - list of URLs waiting to be polled;</li>
 *   <li>{@code set_<uuid>} - set of already-seen URLs (each stored wrapped in double quotes);</li>
 *   <li>{@code item_<uuid>} - hash mapping the SHA-1 hex of a URL to the JSON form of its
 *       {@link Request}, written only when the request carries extras.</li>
 * </ul>
 *
 * <p>Every Redis connection is borrowed in a try-with-resources statement so that
 * {@code Jedis.close()} hands it back to the pool on every path.
 *
 * @author wf (created 2021/6/23 11:03)
 */
public class RedisScheduler extends DuplicateRemovedScheduler implements MonitorableScheduler, DuplicateRemover {

    protected JedisPool pool;

    private static final String QUEUE_PREFIX = "queue_";

    private static final String SET_PREFIX = "set_";

    private static final String ITEM_PREFIX = "item_";

    /**
     * Creates a scheduler backed by a new pool with default settings.
     *
     * @param host Redis host name
     */
    public RedisScheduler(String host) {
        this(new JedisPool(new JedisPoolConfig(), host));
    }

    /**
     * Creates a scheduler on top of an existing pool and registers itself as the duplicate remover.
     *
     * @param pool connection pool used for every Redis operation
     */
    public RedisScheduler(JedisPool pool) {
        this.pool = pool;
        setDuplicateRemover(this);
    }

    /**
     * Deletes the seen-URL set of the given task.
     *
     * @param task task whose duplicate history is dropped
     */
    @Override
    public void resetDuplicateCheck(Task task) {
        try (Jedis jedis = pool.getResource()) {
            jedis.del(getSetKey(task));
        }
    }

    /**
     * Records the request URL in the seen-URL set and reports whether it was already there.
     *
     * @param request request to check
     * @param task    owning task
     * @return {@code true} when SADD added nothing, i.e. the quoted URL was already a member
     */
    @Override
    public boolean isDuplicate(Request request, Task task) {
        try (Jedis jedis = pool.getResource()) {
            // The member is the URL wrapped in literal double quotes.
            return jedis.sadd(getSetKey(task), "\"" + request.getUrl() + "\"") == 0;
        }
    }

    /**
     * Appends the URL to the task queue and, when the request has extras, stores the whole
     * request as JSON in the item hash so that {@link #poll(Task)} can restore it.
     *
     * @param request request to enqueue
     * @param task    owning task
     */
    @Override
    protected void pushWhenNoDuplicate(Request request, Task task) {
        try (Jedis jedis = pool.getResource()) {
            jedis.rpush(getQueueKey(task), request.getUrl());
            if (request.getExtras() != null) {
                String field = DigestUtils.shaHex(request.getUrl());
                String value = JSON.toJSONString(request);
                jedis.hset(getItemKey(task), field, value);
            }
        }
    }

    /**
     * Pops the next URL from the head of the task queue.
     *
     * @param task owning task
     * @return the stored request (with extras) if one was saved for the URL, a plain
     *         {@code Request} for the URL otherwise, or {@code null} when the queue is empty
     */
    @Override
    public synchronized Request poll(Task task) {
        try (Jedis jedis = pool.getResource()) {
            String url = jedis.lpop(getQueueKey(task));
            if (url == null) {
                return null;
            }
            // Read through the String API, the same one used by hset in pushWhenNoDuplicate,
            // so the JSON is decoded with the charset it was written in rather than the
            // JVM's platform default.
            String json = jedis.hget(getItemKey(task), DigestUtils.shaHex(url));
            if (json != null) {
                return JSON.parseObject(json, Request.class);
            }
            return new Request(url);
        }
    }

    /** @return Redis key of the seen-URL set for {@code task} */
    protected String getSetKey(Task task) {
        return SET_PREFIX + task.getUUID();
    }

    /** @return Redis key of the pending-URL list for {@code task} */
    protected String getQueueKey(Task task) {
        return QUEUE_PREFIX + task.getUUID();
    }

    /** @return Redis key of the serialized-request hash for {@code task} */
    protected String getItemKey(Task task) {
        return ITEM_PREFIX + task.getUUID();
    }

    /**
     * @param task owning task
     * @return current length of the pending-URL list
     */
    @Override
    public int getLeftRequestsCount(Task task) {
        try (Jedis jedis = pool.getResource()) {
            Long size = jedis.llen(getQueueKey(task));
            return size.intValue();
        }
    }

    /**
     * @param task owning task
     * @return current cardinality of the seen-URL set
     */
    @Override
    public int getTotalRequestsCount(Task task) {
        try (Jedis jedis = pool.getResource()) {
            Long size = jedis.scard(getSetKey(task));
            return size.intValue();
        }
    }
}

  2. 编写Redis配置
package com.jyft.reptileframework.confg;


import com.fasterxml.jackson.annotation.JsonAutoDetect;
import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.cache.CacheManager;
import org.springframework.cache.annotation.CachingConfigurerSupport;
import org.springframework.cache.annotation.EnableCaching;
import org.springframework.cache.interceptor.KeyGenerator;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.DependsOn;
import org.springframework.data.redis.cache.RedisCacheManager;
import org.springframework.data.redis.connection.RedisConnectionFactory;
import org.springframework.data.redis.connection.lettuce.LettuceConnectionFactory;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.data.redis.serializer.Jackson2JsonRedisSerializer;
import org.springframework.data.redis.serializer.RedisSerializer;
import org.springframework.data.redis.serializer.StringRedisSerializer;
import redis.clients.jedis.JedisPool;
import redis.clients.jedis.JedisPoolConfig;

import java.lang.reflect.Method;


/**
 * Redis configuration: cache manager, a JSON-serializing {@code RedisTemplate}, and the
 * {@code JedisPool} bean.
 *
 * @author wf (created 2021/6/23 9:48)
 */
@Configuration
@EnableCaching
public class RedisConfig extends CachingConfigurerSupport {


    /**
     * Selects Redis as the default cache store.
     *
     * @param connectionFactory Redis connection factory provided by the container
     * @return a cache manager created with {@code RedisCacheManager.create}
     */
    @Bean
    public CacheManager cacheManager(RedisConnectionFactory connectionFactory) {
        return RedisCacheManager.create(connectionFactory);
    }


    /**
     * Builds a template with String keys / hash keys and Jackson-JSON values / hash values.
     *
     * @param lettuceConnectionFactory connection factory the template talks through
     * @return the initialized template
     */
    @Bean
    public RedisTemplate<String, Object> redisTemplate(LettuceConnectionFactory lettuceConnectionFactory) {
        // Value serializer: Jackson, with field-level visibility and type information embedded.
        Jackson2JsonRedisSerializer<Object> jackson2JsonRedisSerializer = new Jackson2JsonRedisSerializer<Object>(
                Object.class);
        ObjectMapper om = new ObjectMapper();
        om.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);
        // NOTE(review): enableDefaultTyping is deprecated in newer Jackson releases in favour of
        // activateDefaultTyping with a PolymorphicTypeValidator - confirm the Jackson version
        // in use before switching.
        om.enableDefaultTyping(ObjectMapper.DefaultTyping.NON_FINAL);
        om.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
        jackson2JsonRedisSerializer.setObjectMapper(om);
        // Assemble the template.
        RedisTemplate<String, Object> redisTemplate = new RedisTemplate<String, Object>();
        redisTemplate.setConnectionFactory(lettuceConnectionFactory);
        RedisSerializer<?> stringSerializer = new StringRedisSerializer();
        redisTemplate.setKeySerializer(stringSerializer);                      // key
        redisTemplate.setValueSerializer(jackson2JsonRedisSerializer);         // value
        redisTemplate.setHashKeySerializer(stringSerializer);                  // hash key
        redisTemplate.setHashValueSerializer(jackson2JsonRedisSerializer);     // hash value
        redisTemplate.afterPropertiesSet();
        return redisTemplate;
    }

    /**
     * Creates the {@code JedisPool} from the static settings published by
     * {@code JedisConfigProperty}.
     *
     * <p>Those statics are only assigned in {@code JedisConfigProperty.afterPropertiesSet()},
     * so this bean explicitly depends on that component; without the ordering the boxed
     * values could still be {@code null} here and fail on unboxing.
     *
     * @return the configured pool
     */
    @Bean
    @DependsOn("jedisConfigProperty")
    public JedisPool redisPoolFactory() {

        JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
        // Maximum number of idle connections (pool default: 8).
        jedisPoolConfig.setMaxIdle(JedisConfigProperty.MAX_IDLE);
        // Minimum number of idle connections; the property is loaded by JedisConfigProperty
        // and was previously never applied to the pool.
        jedisPoolConfig.setMinIdle(JedisConfigProperty.MIN_IDLE);
        // Maximum number of connections (pool default: 8).
        jedisPoolConfig.setMaxTotal(JedisConfigProperty.MAX_ACTIVE);
        // Maximum wait in milliseconds when borrowing a connection (applies when the pool
        // blocks on exhaustion); an exception is thrown on timeout, a negative value blocks
        // indefinitely (pool default: -1).
        jedisPoolConfig.setMaxWaitMillis(JedisConfigProperty.MAX_WAIT_MILLIS);

        return new JedisPool(
                jedisPoolConfig,
                JedisConfigProperty.HOST,
                JedisConfigProperty.PORT,
                JedisConfigProperty.TIMEOUT,
                JedisConfigProperty.PASSWORD,
                JedisConfigProperty.DATABASE);
    }

}

3.编写配置文件读取类

package com.jyft.reptileframework.confg;

import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

/**
 * Reads the {@code spring.redis.*} settings through {@code @Value} injection and republishes
 * them as public static fields once the bean's properties have been set.
 */
@Component
public  class JedisConfigProperty implements InitializingBean {

    /**
     * Index of the Redis database to use.
     */
    @Value("${spring.redis.database}")
    private Integer database;


    /**
     * Server address.
     */
    @Value("${spring.redis.host}")
    private String host;

    /**
     * Server port.
     */
    @Value("${spring.redis.port}")
    private Integer port;

    /**
     * Timeout.
     */
    @Value("${spring.redis.timeout}")
    private Integer timeout;

    /**
     * Password.
     */
    @Value("${spring.redis.password}")
    private String password;

    /**
     * Maximum number of connections in the pool (a negative value means no limit).
     */
    @Value("${spring.redis.jedis.pool.max-active}")
    private int maxActive;

    /**
     * Maximum number of idle connections in the pool.
     */
    @Value("${spring.redis.jedis.pool.max-idle}")
    private int maxIdle;

    /**
     * Minimum number of idle connections in the pool.
     */
    @Value("${spring.redis.jedis.pool.min-idle}")
    private int minIdle;

    /**
     * Maximum blocking wait of the pool (a negative value means no limit).
     */
    @Value("${spring.redis.jedis.pool.max-wait}")
    private long maxWaitMillis;

    /**
     * Public static copies of the injected values; assigned in {@link #afterPropertiesSet()}.
     */
    public static Integer DATABASE;
    public static String HOST;
    public static Integer PORT;
    public static String PASSWORD;
    public static Integer MAX_ACTIVE;
    public static Integer MAX_IDLE;
    public static Integer MIN_IDLE;
    public static Long MAX_WAIT_MILLIS;
    public static Integer TIMEOUT;

    /**
     * Copies every injected value into its static counterpart.
     *
     * <p>An empty password is published as {@code null} so that no empty string is handed to
     * the Redis client; any other password is published unchanged (previously a non-empty
     * password was never copied, leaving {@code PASSWORD} permanently {@code null}).
     */
    @Override
    public void afterPropertiesSet() throws Exception {
        DATABASE = database;
        HOST = host;
        PORT = port;
        // No password configured: publish null instead of an empty string.
        PASSWORD = (password == null || password.isEmpty()) ? null : password;
        MAX_ACTIVE = maxActive;
        MAX_IDLE = maxIdle;
        MIN_IDLE = minIdle;
        MAX_WAIT_MILLIS = maxWaitMillis;
        TIMEOUT = timeout;

    }


}

  4. 使用自定义RedisScheduler
	/**
	 * The JedisPool bean declared in the RedisConfig class.
	 **/
    @Autowired
    private JedisPool jedisPool;
    
    /**
     * Scheduled crawl job; the cron expression fires every 5 seconds.
     *
     * <p>Builds a spider that starts at {@code BJCXJS_START_URL}, de-duplicates through the
     * Redis-backed scheduler, runs it to completion and prints the elapsed time.
     */
    @Scheduled(cron = "0/5 * * * * ?")
    private void test(){
        long beganAt = System.currentTimeMillis();
        System.out.println("---------爬虫启动----------");

        // (placeholder: proxy server configuration would go here)

        // Redis-backed de-duplication is what makes the crawl incremental.
        // Alternative kept for reference - Bloom-filter de-duplication:
        //   new QueueScheduler().setDuplicateRemover(new BloomFilterDuplicateRemover(10000000))
        // NOTE(review): nothing here resets the Redis seen-URL set between runs, so presumably
        // the start URL itself is reported as a duplicate on later runs - confirm that
        // subsequent crawls still enter through it.
        RedisScheduler redisScheduler = new RedisScheduler(jedisPool);

        Spider crawler = Spider.create(bjcxjsProcessor)   // crawl entry point
                .addUrl(BJCXJS_START_URL)                 // first URL to fetch
                .setScheduler(redisScheduler)
                .thread(5)                                // five worker threads
                .addPipeline(bjcxjsPipeline)              // persist results via the custom pipeline
                .setExitWhenComplete(true);               // exit once the queue is drained

        crawler.run();
        crawler.stop();

        System.out.println("--------爬取结束----------");
        long finishedAt = System.currentTimeMillis();
        long elapsedMillis = finishedAt - beganAt;
        System.out.println("耗时:" + DateUtils.getGapTime(elapsedMillis));
    }

注:在使用WebMagic(0.7.3)版本时,由于其自身引入的jedis版本为2.9.0,该版本过低,会出现如下报错场景
在这里插入图片描述
解决办法为,在引入WebMagic的依赖时,将jedis的依赖排除,重新引入,如下

  <!-- WebMagic crawler framework -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <!-- Exclude the bundled jedis here; otherwise its outdated version causes errors -->
                <exclusion>
                    <groupId>redis.clients</groupId>
                    <artifactId>jedis</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- jedis, re-declared explicitly at a newer version -->
        <dependency>
            <groupId>redis.clients</groupId>
            <artifactId>jedis</artifactId>
            <version>2.10.2</version>
        </dependency>
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

亿只王菜菜

各位爷,赏口饭吃吧

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值