App.java
package Gecco;
import com.geccocrawler.gecco.GeccoEngine;
import com.geccocrawler.gecco.annotation.Gecco;
import com.geccocrawler.gecco.annotation.HtmlField;
import com.geccocrawler.gecco.annotation.Request;
import com.geccocrawler.gecco.annotation.Text;
import com.geccocrawler.gecco.request.HttpGetRequest;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.spider.HtmlBean;
import java.util.List;
/**
 * Gecco spider bean for the Sina college score-list page, and the crawler's entry point.
 *
 * <p>Pages whose URL matches {@code matchUrl} are rendered into this bean and then handed to
 * the pipeline registered under the name {@code "mainPipeline"}.
 */
@Gecco(matchUrl = "http://kaoshi.edu.sina.com.cn/college/scorelist?tab=&wl=&local=7&provid=&batch=&syear=&page={page}", pipelines = {"mainPipeline"})
public class App implements HtmlBean
{
    /** The request that produced this page; the pipeline uses it to derive follow-up requests. */
    @Request
    private HttpRequest request;

    // cssPath uses a jQuery-like selector syntax to pick elements out of the matched document.
    // Each element matching ".newslist" is rendered into one school bean.
    @HtmlField(cssPath = ".newslist")
    private List<school> newsSummaryViews;

    // @Text extracts the element's text content. Despite the field name, the pipeline treats this
    // value as the number of the currently selected page and adds one to it.
    @Text
    @HtmlField(cssPath = ".selected > a")
    private String nextPage;

    /** @return the list items extracted from the page */
    public List<school> getNewsSummaryViews() {
        return newsSummaryViews;
    }

    /** @param newsSummaryViews the list items extracted from the page */
    public void setNewsSummaryViews(List<school> newsSummaryViews) {
        this.newsSummaryViews = newsSummaryViews;
    }

    /** @return the request that produced this page */
    public HttpRequest getRequest() {
        return request;
    }

    /** @param request the request that produced this page */
    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    /** @return the text of the {@code .selected > a} pager element */
    public String getNextPage() {
        return nextPage;
    }

    /** @param nextPage the text of the {@code .selected > a} pager element */
    public void setNextPage(String nextPage) {
        this.nextPage = nextPage;
    }

    /**
     * Starts the crawl at page 1 of the score list.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        System.out.println("=======start========");
        HttpGetRequest startUrl = new HttpGetRequest("http://kaoshi.edu.sina.com.cn/college/scorelist?tab=&wl=&local=7&provid=&batch=&syear=&page=1");
        startUrl.setCharset("GBK");
        GeccoEngine.create()
                // Package scanned for Gecco beans and pipelines. It has to be the package these
                // classes are declared in ("Gecco"); the previous value "pGecco" matched nothing.
                .classpath("Gecco")
                // First page to fetch.
                .start(startUrl)
                // Number of crawler threads.
                .thread(1)
                // Pause in milliseconds between two requests of a single crawler thread.
                .interval(2000)
                .run();
    }
}
mainPipeline.java
package Gecco;
import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;
import java.util.List;
/**
 * Pipeline for {@link App} pages: prints every extracted list item and queues the next page.
 */
@PipelineName("mainPipeline")
public class mainPipeline implements Pipeline<App> {

    /** Score-list URL up to and including {@code page=}; the page number is appended to it. */
    private static final String LIST_URL_PREFIX =
            "http://kaoshi.edu.sina.com.cn/college/scorelist?tab=&wl=&local=7&provid=&batch=&syear=&page=";

    /**
     * Prints the items of one rendered page and schedules the following page.
     *
     * <p>If the current page number is missing or not numeric, nothing further is scheduled.
     *
     * @param newsSpiderEntry the rendered page bean
     */
    @Override
    public void process(App newsSpiderEntry) {
        List<school> newsSummaryViews = newsSpiderEntry.getNewsSummaryViews();
        // The list is null when the selector matched nothing; there is simply nothing to print.
        if (newsSummaryViews != null) {
            for (school newsSummaryView : newsSummaryViews) {
                System.out.println(newsSummaryView);
            }
        }

        // The bean's "nextPage" holds the label of the currently selected page.
        String currentPage = newsSpiderEntry.getNextPage();
        if (currentPage == null) {
            System.err.println("mainPipeline: no current page number found, not scheduling a next page");
            return;
        }

        int nextPage;
        try {
            nextPage = Integer.parseInt(currentPage.trim()) + 1;
        } catch (NumberFormatException e) {
            System.err.println("mainPipeline: current page label is not a number: '" + currentPage + "'");
            return;
        }

        // Append the page number to "...&page=". The previous code appended it to "...&page=1",
        // which produced page=12, page=13, ... instead of page=2, page=3, ...
        String nextPageurl = LIST_URL_PREFIX + nextPage;
        HttpRequest request = newsSpiderEntry.getRequest();
        SchedulerContext.into(request.subRequest(nextPageurl));
    }
}
school.java
package Gecco;
import com.geccocrawler.gecco.annotation.*;
import com.geccocrawler.gecco.spider.HtmlBean;
/**
 * One item of the score list, used as the element type of {@code App.newsSummaryViews}.
 *
 * <p>This class is deliberately not annotated with {@code @Gecco}: it is a nested bean, and
 * registering it for the same {@code matchUrl} as {@code App} would make two bean classes
 * compete for one URL while the pipeline only accepts {@code App}.
 *
 * <p>NOTE(review): every selector below starts at {@code #score} and addresses
 * {@code tr:nth-child(2)}, so all instances appear to target the same table row - confirm the
 * selectors against the page markup.
 */
public class school implements HtmlBean {

    /** Not populated by any selector; stays 0 unless {@link #setId(int)} is called. */
    private int id;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(2)")
    private String student_adress;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(3)")
    private String student_nature;

    // Reads the "data-original" attribute of the fourth cell rather than its text.
    @Attr("data-original")
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(4)")
    private String student_pici;

    // NOTE(review): same cell (td:nth-child(4)) as student_pici and no @Text/@Attr annotation -
    // confirm which column and which extraction mode are intended for the year.
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(4)")
    private String year;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(5)")
    private String max_grade;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(6)")
    private String min_grade;

    /** @return the id */
    public int getId() {
        return id;
    }

    /** @param id the id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return text of the second table cell */
    public String getStudent_adress() {
        return student_adress;
    }

    /** @param student_adress text of the second table cell */
    public void setStudent_adress(String student_adress) {
        this.student_adress = student_adress;
    }

    /** @return text of the third table cell */
    public String getStudent_nature() {
        return student_nature;
    }

    /** @param student_nature text of the third table cell */
    public void setStudent_nature(String student_nature) {
        this.student_nature = student_nature;
    }

    /** @return the {@code data-original} attribute of the fourth table cell */
    public String getStudent_pici() {
        return student_pici;
    }

    /** @param student_pici the {@code data-original} attribute of the fourth table cell */
    public void setStudent_pici(String student_pici) {
        this.student_pici = student_pici;
    }

    /** @return the value extracted from the fourth table cell */
    public String getYear() {
        return year;
    }

    /** @param year the value extracted from the fourth table cell */
    public void setYear(String year) {
        this.year = year;
    }

    /** @return text of the fifth table cell */
    public String getMax_grade() {
        return max_grade;
    }

    /** @param max_grade text of the fifth table cell */
    public void setMax_grade(String max_grade) {
        this.max_grade = max_grade;
    }

    /** @return text of the sixth table cell */
    public String getMin_grade() {
        return min_grade;
    }

    /** @param min_grade text of the sixth table cell */
    public void setMin_grade(String min_grade) {
        this.min_grade = min_grade;
    }

    /** @return all fields in the form {@code school{id=..., student_adress='...', ...}} */
    @Override
    public String toString() {
        return "school{" +
                "id=" + id +
                ", student_adress='" + student_adress + '\'' +
                ", student_nature='" + student_nature + '\'' +
                ", student_pici='" + student_pici + '\'' +
                ", year='" + year + '\'' +
                ", max_grade='" + max_grade + '\'' +
                ", min_grade='" + min_grade + '\'' +
                '}';
    }
}