概述:
WebMagic 是一个开源的 Java 爬虫框架,官网:http://webmagic.io/ 。
本篇博客介绍如何使用 WebMagic 爬取码云的"最新推荐"页面,网址:https://git.oschina.net/explore/recommend
代码
package leap.crawler;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.JsonFilePipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;
import java.util.List;
/**
 * Crawls the "recommended projects" listing of Gitee (OSChina) and emits one
 * record per project containing its title, language, watch/star/fork counters
 * and description.
 *
 * <p>Created by FromX on 2017/4/14.
 */
public class OschinaCrawler implements PageProcessor {

    /** Value stored for a field whose node is absent from the project card. */
    private static final String MISSING = " ";

    /** XPath prefix shared by the three counters shown on the right of the title row. */
    private static final String COUNTER_PREFIX =
            "//div[@class='project-title']//div[@class='pull-right']";

    /** Crawl settings: retry count, sleep time between requests and request timeout. */
    private final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(10000);

    /**
     * @return the crawl settings used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts every project card on the page and stores it under the result
     * keys {@code data0}, {@code data1}, ... Afterwards, all links found in the
     * pagination menu are queued as further pages to crawl.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        // One HTML fragment per project card: name, author, description, counters.
        List<String> contents =
                page.getHtml().xpath("//div[@class=\"item\"]//div[@class='content']").all();

        int i = 0;
        for (String content : contents) {
            // Re-parse the fragment so the XPaths below only match inside this card.
            Html html = new Html(content);

            String title = textOrBlank(html, "//div[@class='project-title']/a/text()");
            String language = textOrBlank(html,
                    "//div[@class='project-title']//div[@class='ui small label lang-label']/a/text()");
            String attention = textOrBlank(html, COUNTER_PREFIX + "//a[@title='关注数']//span/text()");
            String collection = textOrBlank(html, COUNTER_PREFIX + "//a[@title='收藏数']//span/text()");
            String fork = textOrBlank(html, COUNTER_PREFIX + "//a[@title='fork数']//span/text()");
            String desc = textOrBlank(html, "//div[@class='project-desc']/text()");

            // NOTE(review): values are neither quoted nor escaped, so this is not valid
            // JSON; the format is kept as-is for compatibility with existing output.
            page.putField("data" + i, "{'title':" + title + ",'language':" + language
                    + ",'attention':" + attention + ",'collection':" + collection
                    + ",'fork':" + fork + ",'desc':" + desc + "}");
            i++;
        }

        // Discover follow-up pages from the pagination menu.
        page.addTargetRequests(
                page.getHtml().xpath("//div[@class=\"ui tiny pagination menu\"]").links().all());
    }

    /**
     * Evaluates {@code xpath} against {@code node} once and returns the first
     * match, or {@link #MISSING} when nothing matched.
     *
     * <p>The null check is performed on the extracted string: calling
     * {@code equals(null)} on the selector result itself is always false and
     * therefore never detects a missing node.
     */
    private static String textOrBlank(Selectable node, String xpath) {
        String value = node.xpath(xpath).get();
        return value == null ? MISSING : value;
    }

    /**
     * Starts the crawl at the recommended-projects page and writes the
     * results through a {@link JsonFilePipeline}.
     */
    public static void main(String[] args) {
        Spider.create(new OschinaCrawler())
                // Seed URL: the recommended-projects listing.
                .addUrl("https://git.oschina.net/explore/recommend")
                // Results are written below this directory.
                .addPipeline(new JsonFilePipeline("D:\\webmagic\\"))
                // Crawl with 10 threads.
                .thread(10)
                // Start the spider; run() is invoked on the calling thread.
                .run();
    }
}
Maven 依赖
<!-- WebMagic crawler: webmagic-core plus webmagic-extension, both pinned to 0.6.1 -->
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.6.1</version>
</dependency>