Java 爬虫框架 WebMagic

学习文档地址:

http://webmagic.io/docs/zh/

一个简单的爬虫实例代码

  1. 导入依赖
    <!-- WebMagic core artifact (us.codecraft:webmagic-core), pinned to version 0.7.3. -->
    <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>webmagic-core</artifactId>
                <version>0.7.3</version>
            </dependency>
            <!-- WebMagic extension artifact (us.codecraft:webmagic-extension); uses the same 0.7.3 version as webmagic-core. -->
            <dependency>
                <groupId>us.codecraft</groupId>
                <artifactId>webmagic-extension</artifactId>
                <version>0.7.3</version>
            </dependency>
  2. AnswerController类
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.web.bind.annotation.RequestMapping;
    import org.springframework.web.bind.annotation.RestController;
    
    /**
     * REST controller exposing a single endpoint that triggers the crawler.
     */
    @RestController
    public class AnswerController {

        /** Fixed response body returned once the crawler has been asked to start. */
        private static final String STARTED_RESPONSE = "ok";

        @Autowired
        private AnswerSpider answerSpider;

        /**
         * Handles requests to {@code /startSpider} by delegating to
         * {@link AnswerSpider#start()}.
         *
         * @return the literal string {@code "ok"} after {@code start()} has returned
         */
        @RequestMapping("/startSpider")
        public String startSpider() {
            this.answerSpider.start();
            return STARTED_RESPONSE;
        }

    }
  3. 自定义PageProcessor
    
    import org.springframework.stereotype.Component;
    import us.codecraft.webmagic.Page;
    import us.codecraft.webmagic.Site;
    import us.codecraft.webmagic.processor.PageProcessor;
    import us.codecraft.webmagic.selector.Html;
    import us.codecraft.webmagic.selector.Selectable;
    
    
    import java.util.List;
    
    /**
     * WebMagic {@link PageProcessor} that handles two kinds of pages:
     * <ul>
     *   <li>list pages (title contains "首页"): every question link and the
     *       "next page" link are queued for crawling, and the page itself is
     *       skipped so that no result reaches the pipelines;</li>
     *   <li>any other page: treated as a question page, from which a
     *       {@code ChatInfo} is extracted and stored under the key
     *       {@code "chatInfo"}.</li>
     * </ul>
     * XPath lookups that match nothing are tolerated: missing links are not
     * queued and missing text fragments are left out of the answer.
     */
    @Component
    public class AnswerPageProcessor implements PageProcessor {

        /** Scheme, host and port prepended to the site-relative links found on list pages. */
        private static final String BASE_URL = "http://28.0.000.00:0000";

        /** Crawl settings: retry times set to 3, sleep time set to 1000. */
        private Site site = Site.me().setRetryTimes(3).setSleepTime(1000);

        /**
         * Classifies the page by its title and either queues further links or
         * extracts a question/answer record.
         *
         * @param page the downloaded page handed over by WebMagic
         */
        @Override
        public void process(Page page) {
            Html html = page.getHtml();
            String title = html.xpath("//title/text()").toString();
            if (title == null) {
                // Without a title the page cannot be classified; skip it instead of
                // failing with a NullPointerException on title.contains(...).
                page.getResultItems().setSkip(true);
                return;
            }
            if (title.contains("首页")) {
                enqueueListLinks(page, html);
                // List pages carry no record of their own, so keep them away from the pipelines.
                page.getResultItems().setSkip(true);
            } else {
                // Store the record in resultItems for the pipelines to pick up.
                page.putField("chatInfo", extractChatInfo(html));
            }
        }

        /** Queues every question link on a list page plus the link of the following list page. */
        private void enqueueListLinks(Page page, Html html) {
            for (Selectable node : html.xpath("//div[@class='media']").nodes()) {
                addRelativeTarget(page, node.xpath("//a[1]/@href").toString());
            }
            // The "next page" entry is taken as the second-to-last <li> of the pagination list;
            // guard against lists too short to have such an entry.
            List<Selectable> pagerItems = html.xpath("//ul[@class='pagination']/li").nodes();
            if (pagerItems.size() >= 2) {
                Selectable nextItem = pagerItems.get(pagerItems.size() - 2);
                addRelativeTarget(page, nextItem.xpath("//a[1]/@href").toString());
            }
        }

        /** Queues {@code BASE_URL + href}, ignoring a missing (null) href. */
        private void addRelativeTarget(Page page, String href) {
            if (href != null) {
                page.addTargetRequest(BASE_URL + href);
            }
        }

        /**
         * Builds the record of a question page: the question text becomes the key,
         * and the main answer followed by the text of every media body becomes the word.
         * The word is an empty string when none of these fragments is present.
         */
        private ChatInfo extractChatInfo(Html html) {
            String question = html.xpath("//div[@class='col-lg-9']/div[1]/h4/span/text()").toString();
            StringBuilder answer = new StringBuilder();
            appendIfPresent(answer, html.xpath("//div[@id='question-view']/textarea/text()").toString());
            for (Selectable mediaBody : html.xpath("//div[@class='media-body']").nodes()) {
                appendIfPresent(answer, mediaBody.xpath("//h4/div[1]/text()").toString());
            }
            ChatInfo chatInfo = new ChatInfo();
            chatInfo.setKey(question);
            chatInfo.setWord(answer.toString());
            return chatInfo;
        }

        /** Appends {@code text} unless it is null, so the literal "null" never ends up in the answer. */
        private static void appendIfPresent(StringBuilder target, String text) {
            if (text != null) {
                target.append(text);
            }
        }

        /**
         * @return the crawl settings used for this processor
         */
        @Override
        public Site getSite() {
            return site;
        }

    }
     
  4. 自定义Pipeline
    import org.springframework.beans.factory.annotation.Autowired;
    import org.springframework.stereotype.Component;
    import us.codecraft.webmagic.ResultItems;
    import us.codecraft.webmagic.Task;
    import us.codecraft.webmagic.pipeline.Pipeline;
    
    /**
     * Custom pipeline that passes each scraped record on to {@code ChatMapper}
     * (per the original author's note, this persists the data to the database).
     */
    @Component
    public class AnswerPipeline implements Pipeline {

        @Autowired
        private ChatMapper chatMapper;

        /**
         * Reads the record stored under {@code "chatInfo"} and inserts its key and word.
         *
         * @param resultItems the fields collected while processing one page
         * @param task        the running crawl task (not used here)
         */
        @Override
        public void process(ResultItems resultItems, Task task) {
            // Fetch the record with the same key it was stored under.
            final ChatInfo scraped = resultItems.get("chatInfo");
            this.chatMapper.insert(scraped.getKey(), scraped.getWord());
        }
    }

  5. Spider是爬虫启动的入口
    
    /**
     * Entry point that assembles and launches the crawler.
     */
    @Component
    public class AnswerSpider {

        @Autowired
        private AnswerPageProcessor answerPageProcessor;

        @Autowired
        private AnswerPipeline answerPipeline;

        /**
         * Creates a spider around the page processor, seeds it with the index URL,
         * attaches the custom pipeline, sets the thread count to 1 and starts it.
         */
        public void start() {
            final Spider crawler = Spider.create(answerPageProcessor);
            crawler.addUrl("http://28.0.000.00:0000/index");
            // Register the custom pipeline so that results are saved to the database.
            // Without it the default pipeline is used, which prints results to the console.
            crawler.addPipeline(answerPipeline);
            crawler.thread(1);
            crawler.start();
        }

    }
    Spider 的详细配置请参阅学习文档。
     
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值