App

App.java

 package Gecco;
    import com.geccocrawler.gecco.GeccoEngine;
    import com.geccocrawler.gecco.annotation.Gecco;
    import com.geccocrawler.gecco.annotation.HtmlField;
    import com.geccocrawler.gecco.annotation.Request;
    import com.geccocrawler.gecco.annotation.Text;
    import com.geccocrawler.gecco.request.HttpGetRequest;
    import com.geccocrawler.gecco.request.HttpRequest;
    import com.geccocrawler.gecco.spider.HtmlBean;
    
    import java.util.List;
    
    /**
 * Gecco page bean for the Sina college score-list page; also hosts the crawler's main entry point.
 *
 */
@Gecco(matchUrl = "http://kaoshi.edu.sina.com.cn/college/scorelist?tab=&wl=&local=7&provid=&batch=&syear=&page={page}", pipelines = {"mainPipeline"})
public class App implements HtmlBean
{

    // The request that produced this page; the pipeline derives follow-up requests from it.
    @Request
    private HttpRequest request;

    // cssPath uses a jQuery-like selector syntax to pick elements out of the
    // document fetched for matchUrl. Each match is rendered into a school bean
    // (see the school class), which wraps the parsed list-page data.
    @HtmlField(cssPath = ".newslist")
    private List<school> newsSummaryViews;

    // @Text extracts the element's text (@Html would extract markup, @Attr an attribute value).
    // NOTE(review): ".selected > a" presumably is the currently highlighted page
    // number in the pager; the pipeline adds 1 to it — confirm against the page markup.
    @Text
    @HtmlField(cssPath = ".selected > a")
    private String nextPage;

    /** @return the beans rendered from the elements matching {@code .newslist} */
    public List<school> getNewsSummaryViews() {
        return newsSummaryViews;
    }

    /** @param newsSummaryViews the rendered list entries */
    public void setNewsSummaryViews(List<school> newsSummaryViews) {
        this.newsSummaryViews = newsSummaryViews;
    }

    /** @return the request this bean was rendered from */
    public HttpRequest getRequest() {
        return request;
    }

    /** @param request the request this bean was rendered from */
    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    /** @return the text of the {@code .selected > a} element */
    public String getNextPage() {
        return nextPage;
    }

    /** @param nextPage the text of the {@code .selected > a} element */
    public void setNextPage(String nextPage) {
        this.nextPage = nextPage;
    }

    /**
     * Starts the crawl at page 1 of the score list.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        System.out.println("=======start========");

        HttpGetRequest startUrl = new HttpGetRequest("http://kaoshi.edu.sina.com.cn/college/scorelist?tab=&wl=&local=7&provid=&batch=&syear=&page=1");
        startUrl.setCharset("GBK");

        GeccoEngine.create()
                // Package that Gecco scans for beans and pipelines. It must match
                // this file's package declaration ("Gecco"); the previous value
                // "pGecco" named a package that does not exist in this project.
                .classpath("Gecco")
                // First page to fetch.
                .start(startUrl)
                // Number of crawler threads.
                .thread(1)
                // Pause after each request of a single crawler (presumably milliseconds — confirm).
                .interval(2000)
                .run();
    }

}

mainPipeline.java

package Gecco;

import com.geccocrawler.gecco.annotation.PipelineName;
import com.geccocrawler.gecco.pipeline.Pipeline;
import com.geccocrawler.gecco.request.HttpRequest;
import com.geccocrawler.gecco.scheduler.SchedulerContext;

import java.util.List;

/**
 * Pipeline for {@link App} beans: prints every parsed list entry and schedules
 * the following result page.
 */
@PipelineName("mainPipeline")
public class mainPipeline implements Pipeline<App> {

    // Listing URL up to and including "page="; the page number is appended to it.
    private static final String PAGE_URL_PREFIX =
            "http://kaoshi.edu.sina.com.cn/college/scorelist?tab=&wl=&local=7&provid=&batch=&syear=&page=";

    /**
     * Prints the entries of the given page and enqueues a request for the next page.
     *
     * @param newsSpiderEntry the rendered list page
     */
    @Override
    public void process(App newsSpiderEntry) {
        List<school> newsSummaryViews = newsSpiderEntry.getNewsSummaryViews();
        if (newsSummaryViews != null) {
            for (school newsSummaryView : newsSummaryViews) {
                System.out.println(newsSummaryView);
            }
        }

        // The bean exposes the current page number as text; the next page is that value + 1.
        String currentPageText = newsSpiderEntry.getNextPage();
        if (currentPageText == null) {
            // No pager element was matched, so there is nothing to follow.
            return;
        }
        int nextPage;
        try {
            nextPage = Integer.parseInt(currentPageText.trim()) + 1;
        } catch (NumberFormatException ignored) {
            // The pager text is not a number; stop paginating instead of failing the pipeline.
            return;
        }

        // Enqueue the next page. The previous code appended the number to a URL that
        // already ended in "page=1", yielding page=12, page=13, ... instead of 2, 3, ...
        HttpRequest request = newsSpiderEntry.getRequest();
        String nextPageurl = PAGE_URL_PREFIX + nextPage;
        SchedulerContext.into(request.subRequest(nextPageurl));
    }
}

school.java

package Gecco;

import com.geccocrawler.gecco.annotation.*;
import com.geccocrawler.gecco.spider.HtmlBean;

/**
 * One row of the score table, rendered as an element of {@code App#newsSummaryViews}.
 *
 * <p>This class is intentionally not annotated with {@code @Gecco}: it previously
 * declared the same matchUrl and the same "mainPipeline" as {@code App}, but
 * that pipeline is a {@code Pipeline<App>} and this bean is only used as a
 * nested element type.
 */
public class school implements HtmlBean {
    // Not populated by any selector; keeps its default value unless set by the caller.
    private int id;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(2)")
    private String student_adress;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(3)")
    private String student_nature;

    // NOTE(review): reads the "data-original" attribute of a <td>; confirm the
    // cell really carries that attribute, otherwise @Text is probably intended.
    @Attr("data-original")
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(4)")
    private String student_pici;

    // NOTE(review): same selector (td:nth-child(4)) as student_pici and no
    // @Text annotation — looks like a copy-paste slip; verify the column index.
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(4)")
    private String year;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(5)")
    private String max_grade;

    @Text
    @HtmlField(cssPath = "#score > div.tabsContainer > table > tbody > tr:nth-child(2) > td:nth-child(6)")
    private String min_grade;

    /** @return the id assigned via {@link #setId(int)} */
    public int getId() {
        return id;
    }

    /** @param id the id to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return text of the second table cell */
    public String getStudent_adress() {
        return student_adress;
    }

    /** @param student_adress text of the second table cell */
    public void setStudent_adress(String student_adress) {
        this.student_adress = student_adress;
    }

    /** @return text of the third table cell */
    public String getStudent_nature() {
        return student_nature;
    }

    /** @param student_nature text of the third table cell */
    public void setStudent_nature(String student_nature) {
        this.student_nature = student_nature;
    }

    /** @return the "data-original" attribute value of the fourth table cell */
    public String getStudent_pici() {
        return student_pici;
    }

    /** @param student_pici the "data-original" attribute value of the fourth table cell */
    public void setStudent_pici(String student_pici) {
        this.student_pici = student_pici;
    }

    /** @return the value extracted from the fourth table cell */
    public String getYear() {
        return year;
    }

    /** @param year the value extracted from the fourth table cell */
    public void setYear(String year) {
        this.year = year;
    }

    /** @return text of the fifth table cell */
    public String getMax_grade() {
        return max_grade;
    }

    /** @param max_grade text of the fifth table cell */
    public void setMax_grade(String max_grade) {
        this.max_grade = max_grade;
    }

    /** @return text of the sixth table cell */
    public String getMin_grade() {
        return min_grade;
    }

    /** @param min_grade text of the sixth table cell */
    public void setMin_grade(String min_grade) {
        this.min_grade = min_grade;
    }

    /** @return all fields in the form {@code school{id=..., student_adress='...', ...}} */
    @Override
    public String toString() {
        return "school{" +
                "id=" + id +
                ", student_adress='" + student_adress + '\'' +
                ", student_nature='" + student_nature + '\'' +
                ", student_pici='" + student_pici + '\'' +
                ", year='" + year + '\'' +
                ", max_grade='" + max_grade + '\'' +
                ", min_grade='" + min_grade + '\'' +
                '}';
    }
}

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值