爬取的目标:
TARGET_URL:http://blog.csdn.net/ycd500756
抓取的内容包括每条博客的(标题,时间,阅读次数)
分析:
步骤:
1.首先写一个Model类
package com.mark.WebMagic.CSDN;
/**
 * Plain data holder for one blog entry: its title, its view count and its
 * post time. All three values are kept as raw strings and default to
 * {@code null} until a setter is called.
 */
public class CSDNModel {

    /** Title of the blog entry. */
    private String title;

    /** View count of the blog entry, stored as text. */
    private String view;

    /** Post time of the blog entry, stored as text. */
    private String time;

    // ---- title ----

    /** @return the stored title, or {@code null} if none was set */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    // ---- view ----

    /** @return the stored view count text, or {@code null} if none was set */
    public String getView() {
        return this.view;
    }

    /** @param view the view count text to store */
    public void setView(String view) {
        this.view = view;
    }

    // ---- time ----

    /** @return the stored post time text, or {@code null} if none was set */
    public String getTime() {
        return this.time;
    }

    /** @param time the post time text to store */
    public void setTime(String time) {
        this.time = time;
    }
}
2.Processer类
package com.mark.WebMagic.CSDN;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
/**
 * Page processor for a blog article-list page.
 *
 * <p>For every fetched page it (1) collects the links found inside the
 * {@code div#papelist} element and queues them as further requests, and
 * (2) extracts one {@link CSDNModel} per article entry and stores the
 * resulting list in the page's result fields under the key {@code "model"}.
 */
public class CSDNProcesser implements PageProcessor {

    /**
     * Crawl settings: 3 retries, 100 ms sleep between requests.
     *
     * <p>{@code Site.me()} is a static factory that returns a new
     * {@code Site} each time it is called, so the settings have to be applied
     * to the very instance that is kept here. Calling {@code site.me()} on an
     * existing instance configures a throw-away object and leaves this one
     * untouched.
     */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(100);

    /**
     * Extracts the article entries of the page and queues the pager links.
     *
     * @param page the downloaded page to process
     */
    public void process(Page page) {
        Html html = page.getHtml();
        // Collect the URLs of the other list pages from the pager element.
        List<String> links = html.css("div#papelist").links().all();
        parserCSDNModel(html.getDocument(), page);
        page.addTargetRequests(links);
    }

    /**
     * Parses the article entries out of the document and publishes them on
     * the page under the field name {@code "model"}.
     *
     * <p>If the document has no {@code article_list} element the page is
     * marked as skipped instead of failing with a
     * {@link NullPointerException}.
     *
     * @param document the parsed HTML document of the page
     * @param page     the page that receives the extracted result field
     */
    private void parserCSDNModel(Document document, Page page) {
        Element divList = document.getElementById("article_list");
        if (divList == null) {
            // Not an article-list page: nothing to extract, so keep it away
            // from the pipelines rather than crashing.
            page.setSkip(true);
            return;
        }

        List<CSDNModel> models = new ArrayList<CSDNModel>();
        Elements divs = divList.select("div[class=list_item article_item]");
        for (Element div : divs) {
            CSDNModel model = new CSDNModel();
            model.setTitle(div.select("div.article_title").text().trim());
            model.setView(div.select("span.link_view").text().trim());
            model.setTime(div.select("span.link_postdate").text().trim());
            models.add(model);
        }
        page.putField("model", models);
    }

    /**
     * @return the crawl settings used by this processor
     */
    public Site getSite() {
        return site;
    }
}
3.Pipeline类
package com.mark.WebMagic.CSDN;
import java.util.List;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
/**
 * Pipeline that hands the extracted blog entries over to {@link CSDNDAO}.
 */
public class CSDNPipeLine implements Pipeline {

    /**
     * Reads the list stored under the key {@code "model"} and passes it to
     * {@link CSDNDAO#insert(List)}.
     *
     * @param resultItems the extraction results of one page
     * @param task        the running task (not used here)
     */
    public void process(ResultItems resultItems, Task task) {
        List<CSDNModel> models = resultItems.get("model");
        if (models == null) {
            // Nothing was stored under "model" for this page; passing null on
            // would only fail inside the DAO's loop.
            return;
        }
        CSDNDAO.insert(models);
    }
}
4.DAO类(保存数据,这里只是简单的输出而已)
package com.mark.WebMagic.CSDN;
import java.util.List;
/**
 * Stand-in for a persistence layer: instead of storing the entries it just
 * writes them to standard output.
 */
public class CSDNDAO {

    /**
     * Prints one line per entry (view count, post time, title) followed by a
     * separator line, and finally prints how many entries were handled.
     *
     * @param models the entries to output
     */
    public static void insert(List<CSDNModel> models) {
        final String separator = "----------------------------------------------------------";
        int printed = 0;
        for (int i = 0; i < models.size(); i++) {
            CSDNModel entry = models.get(i);
            StringBuilder line = new StringBuilder();
            line.append("阅读次数:").append(entry.getView());
            line.append(" 发布时间:").append(entry.getTime());
            line.append(" 标题:").append(entry.getTitle());
            System.out.println(line.toString());
            System.out.println(separator);
            printed = i + 1;
        }
        System.out.println(printed);
    }
}
5.Test类
package com.mark.WebMagic.CSDN;
import us.codecraft.webmagic.Spider;
/**
 * Entry point that starts the crawl of the target blog.
 */
public class CSDNProcesserTest {

    /** Start URL of the crawl. */
    private static final String TARGET_URL = "http://blog.csdn.net/ycd500756";

    /**
     * Builds a spider around {@link CSDNProcesser}, seeds it with
     * {@link #TARGET_URL}, attaches {@link CSDNPipeLine}, requests 5 threads
     * and runs it.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // The configuration calls are issued one by one on the same spider;
        // this assumes each fluent setter returns the spider it was called on.
        Spider crawler = Spider.create(new CSDNProcesser());
        crawler.addUrl(TARGET_URL);
        crawler.addPipeline(new CSDNPipeLine());
        crawler.thread(5);
        crawler.run();
    }
}