2024多模块集成爬虫之webmagic

1 新建模块xsy-vi-server-webmagic

在这里插入图片描述

2 引入依赖

        <!-- WebMagic artifacts; the processor class in this article imports us.codecraft.webmagic.* types.
             NOTE(review): no <version> is declared here - presumably it is managed by a parent POM /
             dependencyManagement section; confirm before copying this snippet into a standalone project. -->
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
        </dependency>
        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
        </dependency>

3 编写mapper、service

供保存数据
在这里插入图片描述

4 编写解析器

package org.xsy.itgo.news;

import org.apache.commons.collections4.CollectionUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.xsy.itgo.news.domain.CmsContent;
import org.xsy.itgo.news.service.CmsContentService;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;
import us.codecraft.webmagic.selector.XpathSelector;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * WebMagic page processor that extracts politics headlines from the crawled list page,
 * prints them to standard output and persists each one through {@link CmsContentService}.
 */
@Component
public class NewsPageProcessor implements PageProcessor {

    /** Seed URL handed to the spider by {@link #start()}. */
    private static final String START_URL = "http://www.news.cn/politics";

    /** Only links containing this fragment are collected. */
    private static final String LINK_FILTER = "politics";

    /**
     * Character offsets of the {@code yyyyMMdd} segment inside an article link.
     * For links that begin with the 28-character prefix {@code http://www.news.cn/politics/}
     * these offsets select the eight characters that follow it.
     * NOTE(review): assumes article links have that shape - confirm against the live page.
     */
    private static final int DATE_BEGIN = 28;
    private static final int DATE_END = 36;

    private static final String KEY_TIME = "timeStr";
    private static final String KEY_TITLE = "title";
    private static final String KEY_LINK = "link";

    private final CmsContentService contentService;

    /** Crawl configuration for the site: retry count and sleep time between requests. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    @Autowired
    public NewsPageProcessor(CmsContentService contentService) {
        this.contentService = contentService;
    }

    /**
     * Core extraction hook called by WebMagic for every downloaded page.
     * Selects the {@code column-center-item} nodes under {@code .xpage-content-list},
     * keeps those whose link contains {@code "politics"}, then prints and saves them.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        List<Selectable> nodes = page.getHtml()
                .$(".xpage-content-list")
                .select(new XpathSelector("div[@class='column-center-item']"))
                .nodes();

        List<Map<String, Object>> newsList = new ArrayList<>();
        // NOTE(review): iteration starts at index 1, exactly as in the original code, so the
        // first matched node is skipped. It is unclear whether that is intentional - confirm.
        for (int i = 1; i < nodes.size(); i++) {
            Selectable node = nodes.get(i);
            String link = node.$(".tit a", "href").get();
            // A node without an anchor/href yields null; skip it instead of failing the page.
            if (link == null || !link.contains(LINK_FILTER)) {
                continue;
            }
            Map<String, Object> item = new HashMap<>();
            item.put(KEY_TIME, extractDate(link));
            item.put(KEY_TITLE, node.$(".tit a", "text").get());
            item.put(KEY_LINK, link);
            newsList.add(item);
        }

        if (CollectionUtils.isEmpty(newsList)) {
            return;
        }
        printNews(newsList);
        saveNews(newsList);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs a spider over the seed URL with two threads, using this processor.
     * {@code run()} executes the crawl on the calling thread.
     */
    public void start() {
        // Reuse this (Spring-managed) instance rather than constructing a second processor.
        Spider.create(this).addUrl(START_URL).thread(2).run();
    }

    /**
     * Converts the {@code yyyyMMdd} segment of a link into {@code yyyy-MM-dd}.
     *
     * @param link article link, never {@code null}
     * @return the formatted date, or an empty string when the link is too short or the
     *         segment is not a valid date
     */
    private static String extractDate(String link) {
        if (link.length() < DATE_END) {
            return "";
        }
        String raw = link.substring(DATE_BEGIN, DATE_END);
        // SimpleDateFormat is not thread-safe, so instances stay local to this call.
        SimpleDateFormat input = new SimpleDateFormat("yyyyMMdd");
        // Reject impossible dates instead of silently rolling them over.
        input.setLenient(false);
        try {
            Date parsed = input.parse(raw);
            return new SimpleDateFormat("yyyy-MM-dd").format(parsed);
        } catch (ParseException e) {
            System.err.println("Cannot parse date '" + raw + "' from link " + link + ": " + e.getMessage());
            return "";
        }
    }

    /** Prints every collected item, numbered from 1. */
    private static void printNews(List<Map<String, Object>> newsList) {
        System.out.println("市政关注:");
        for (int i = 0; i < newsList.size(); i++) {
            Map<String, Object> item = newsList.get(i);
            System.out.printf("%d、%s->%s,访问地址:%s%n",
                    i + 1, text(item, KEY_TIME), text(item, KEY_TITLE), text(item, KEY_LINK));
        }
    }

    /** Persists every collected item through the content service. */
    private void saveNews(List<Map<String, Object>> newsList) {
        for (Map<String, Object> item : newsList) {
            CmsContent content = new CmsContent();
            content.setTimestr(text(item, KEY_TIME));
            content.setTitle(text(item, KEY_TITLE));
            content.setUrl(text(item, KEY_LINK));
            contentService.saveCmsContent(content);
        }
    }

    /** Returns the map value as a string, or an empty string when it is absent or {@code null}. */
    private static String text(Map<String, Object> item, String key) {
        return Objects.toString(item.get(key), "");
    }

}

5 集成定时任务

5.1 引入依赖

        <!-- Spring Boot Quartz starter.
             NOTE(review): the task class in this article imports
             org.springframework.scheduling.annotation.Scheduled and uses no Quartz types -
             confirm whether this starter is actually required. No <version> is declared here;
             presumably it is managed by the Spring Boot parent/BOM - confirm. -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-quartz</artifactId>
        </dependency>

5.2 编写定时任务类

在这里插入图片描述

package org.xsy.itgo.task;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.xsy.itgo.news.NewsPageProcessor;

/**
 * Spring-managed scheduled task that triggers the news crawl.
 * NOTE(review): {@code @Scheduled} methods only fire when scheduling is enabled in the
 * application (e.g. via {@code @EnableScheduling}); that configuration is not visible here - confirm.
 */
@Component
public class MyTak {

    /** Crawler bean, injected by type. */
    @Autowired
    private NewsPageProcessor newsProcessor;

    /**
     * Starts one crawl run. Scheduled at a fixed rate of 5000 ms.
     */
    // Alternative trigger kept from the original: @Scheduled(cron = "0 0/1 * * * ?")
    @Scheduled(fixedRate = 5000)
    public void execute() {
        this.newsProcessor.start();
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值