用WebMagic框架写一个简单的爬虫

爬取的目标
TARGET_URL:http://blog.csdn.net/ycd500756
抓取的内容包括每条博客的(标题,时间,阅读次数)


分析:

这里写图片描述

这里写图片描述

步骤:
1.首先写一个Model类

package com.mark.WebMagic.CSDN;

/**
 * Simple value holder for one blog entry: its title, view count and
 * publication time, all kept as plain strings.
 */
public class CSDNModel {
    /** Blog post title. */
    private String title;
    /** Publication time. */
    private String time;
    /** View count. */
    private String view;

    /** @return the stored title, or {@code null} if none was set */
    public String getTitle() {
        return this.title;
    }

    /** @return the stored view count, or {@code null} if none was set */
    public String getView() {
        return this.view;
    }

    /** @return the stored publication time, or {@code null} if none was set */
    public String getTime() {
        return this.time;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param view the view count to store */
    public void setView(String view) {
        this.view = view;
    }

    /** @param time the publication time to store */
    public void setTime(String time) {
        this.time = time;
    }
}

2.Processer类

package com.mark.WebMagic.CSDN;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;

public class CSDNProcesser implements PageProcessor {

    private Site site= new Site();
    public void process(Page page) {
        Html html = page.getHtml();
        //获取扩展其他页的URl
        List<String> links =  html.css("div#papelist").links().all();
        parserCSDNModel(html.getDocument(),page);
        page.addTargetRequests(links);

    }

    //解析页面信息
    private void parserCSDNModel(Document document, Page page) {
        CSDNModel model = null;
        List<CSDNModel> models = new ArrayList<CSDNModel>();
        Element divList = document.getElementById("article_list");
        Elements divs = divList.select("div[class=list_item article_item]");
        for(Element div:divs){
            model = new CSDNModel();
            String title = div.select("div.article_title").text().trim();
            String view = div.select("span.link_view").text().trim();
            String time = div.select("span.link_postdate").text().trim();
            model.setTitle(title);
            model.setView(view);
            model.setTime(time);
            models.add(model);
        }
        page.putField("model",models);
    }

    public Site getSite() {
        site.me().setRetryTimes(3).setSleepTime(100);
        return site;
    }

}

3.Pipeline类

package com.mark.WebMagic.CSDN;

import java.util.List;

import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

public class CSDNPipeLine implements Pipeline {

    public void process(ResultItems resultItems, Task task) {
        List<CSDNModel> models =  resultItems.get("model");
        CSDNDAO.insert(models);
    }

}

4.DAO类(保存数据,这里只是简单的输出而已)

package com.mark.WebMagic.CSDN;

import java.util.List;

/**
 * Stand-in for a persistence layer: instead of saving the models it prints
 * them to standard output.
 */
public class CSDNDAO {

    /**
     * Prints one line per model followed by a separator line, then prints the
     * number of models that were written.
     *
     * @param models the models to print
     */
    public static void insert(List<CSDNModel> models) {
        final String separator = "----------------------------------------------------------";
        int printed = 0;
        for (CSDNModel entry : models) {
            String line = "阅读次数:" + entry.getView()
                    + "  发布时间:" + entry.getTime()
                    + "   标题:" + entry.getTitle();
            System.out.println(line);
            System.out.println(separator);
            printed++;
        }
        System.out.println(printed);
    }

}

5.Test类

package com.mark.WebMagic.CSDN;

import us.codecraft.webmagic.Spider;

/**
 * Entry point that starts the crawler against the target blog.
 */
public class CSDNProcesserTest {
    /** Start URL of the crawl. */
    private static final String TARGET_URL = "http://blog.csdn.net/ycd500756";

    /**
     * Builds a spider with the processor, the start URL, the pipeline and
     * five threads, then runs it.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider crawler = Spider.create(new CSDNProcesser())
                .addUrl(TARGET_URL)
                .addPipeline(new CSDNPipeLine())
                .thread(5);
        crawler.run();
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值