WebCollector基本使用

package com.example.swagger;


import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
import java.io.*;

/**
 * Breadth-first crawler that collects article pages from www.bzu.edu.cn and
 * dumps the URL, title and body text of every matching page into one text file.
 *
 * <p>{@link #visit(Page, CrawlDatums)} is invoked by the crawler's worker
 * threads (five are configured in {@link #main(String[])}), so all access to
 * the shared text buffer and to the output file is serialized through a
 * private lock.
 */
public class RediffNewCrawler extends BreadthCrawler {
    /** Pattern of the article pages whose content is extracted. */
    private static final String ARTICLE_URL_REGEX = "https://www.bzu.edu.cn/\\d+/\\d+/\\w+/.*htm";

    /** Guards {@link #sb} and the output file against concurrent visits. */
    private final Object writeLock = new Object();
    /** Accumulates the text of every article collected so far. */
    private final StringBuilder sb = new StringBuilder();
    /** Path of the output file. */
    private final String fileName;
    /** Name of the charset used to encode the output file. */
    private final String code;

    /**
     * Runs the crawl with a fixed configuration.
     *
     * @param args ignored
     * @throws Exception if the crawler fails to start
     */
    public static void main(String[] args) throws Exception {
        RediffNewCrawler crawler = new RediffNewCrawler
                ("rediffNewCrawler",
                        true, "C:/Users/brukl/Desktop/rediffNews.txt", "utf-8");
        // Number of worker threads.
        crawler.setThreads(5);
        // Maximum number of pages collected per depth level.
        crawler.getConf().setTopN(300);
        // Start crawling with the given depth.
        crawler.start(3);
    }

    /**
     * Creates the crawler and registers its seed URL and URL rules.
     *
     * @param crawlPath path handed to the {@code BreadthCrawler} constructor
     * @param autoParse flag handed to the {@code BreadthCrawler} constructor
     * @param filename  path of the output text file
     * @param cod       charset name used when writing the output file
     */
    public RediffNewCrawler(java.lang.String crawlPath, boolean autoParse,String filename,String cod) {
        super(crawlPath, autoParse);
        // Output configuration: file name and encoding.
        this.fileName = filename;
        this.code = cod;
        // Seed URL.
        this.addSeed("https://www.bzu.edu.cn/xyxw/list.htm");
        // URL rules: follow article pages; the leading '-' marks an exclusion rule for images.
        this.addRegex(ARTICLE_URL_REGEX);
        this.addRegex("-.*\\.(jpg|png|gif).*");
    }

    /**
     * Extracts title and content from article pages, appends them to the
     * accumulated text, and rewrites the output file with everything
     * collected so far. Pages that do not match the article pattern add
     * nothing but still trigger a rewrite, as before.
     *
     * <p>I/O failures are reported on standard error and do not abort the crawl.
     *
     * @param page        the fetched page
     * @param crawlDatums follow-up crawl tasks (unused here)
     */
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        String entry = null;
        // Only article pages are extracted; everything else is filtered out here.
        if (page.matchUrl(ARTICLE_URL_REGEX)) {
            String url = page.url();
            // Parse the page with jsoup selectors.
            String title = page.select(".biaoti.biaoti-line").text();
            String content = page.select(".wp_articlecontent p").text();
            entry = "URL:\t" + url + "\n" + "title:\t" + title
                    + "\ncontent:\t" + content + "\n\n";
        }
        // Append and write under one lock so entries never interleave and
        // a stale snapshot can never overwrite a newer one.
        synchronized (writeLock) {
            if (entry != null) {
                sb.append(entry);
            }
            try {
                writeFile(fileName, sb.toString(), code);
            } catch (IOException e) {
                System.err.println("Failed to write crawl output to " + fileName);
                e.printStackTrace();
            }
        }
    }

    /**
     * Writes {@code content} to {@code file}, replacing any previous content.
     * The streams are closed even when encoding lookup or writing fails.
     *
     * @param file    path of the file to (over)write
     * @param content text to write
     * @param code    name of the charset used to encode the text
     * @throws IOException if the file cannot be opened or written, or the
     *                     charset name is not supported
     */
    public static void writeFile(String file, String content, String code)throws IOException {
        File result = new File(file);
        try (FileOutputStream out = new FileOutputStream(result, false);
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out, code))) {
            bw.write(content);
        }
    }


}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值