package com.example.swagger;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.rocks.BreadthCrawler;
import java.io.*;
public class RediffNewCrawler extends BreadthCrawler {

    // shared output buffer; StringBuffer because visit() runs on multiple worker threads
    private static StringBuffer sb = new StringBuffer();
    private static String fileName;
    private static String code;

    public static void main(String[] args) throws Exception {
        RediffNewCrawler crawler = new RediffNewCrawler("rediffNewCrawler", true,
                "C:/Users/brukl/Desktop/rediffNews.txt", "utf-8");
        // set the number of crawler threads
        crawler.setThreads(5);
        // set the maximum number of pages fetched per level
        crawler.getConf().setTopN(300);
        // start crawling; the argument is the crawl depth
        crawler.start(3);
    }
    public RediffNewCrawler(String crawlPath, boolean autoParse, String filename, String cod) {
        super(crawlPath, autoParse);
        // seed URL to start crawling from
        this.addSeed("https://www.bzu.edu.cn/xyxw/list.htm");
        // URL rule: only article pages matching this pattern are followed
        this.addRegex("https://www.bzu.edu.cn/\\d+/\\d+/\\w+/.*htm");
        // rules starting with "-" are negative rules: skip image URLs
        this.addRegex("-.*\\.(jpg|png|gif).*");
        // output configuration: file name and encoding
        fileName = filename;
        code = cod;
    }
    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        String url = page.url();
        // only pages matching the article pattern are extracted; everything else is skipped
        if (page.matchUrl("https://www.bzu.edu.cn/\\d+/\\d+/\\w+/.*htm")) {
            // parse the page with the built-in Jsoup selectors
            String title = page.select(".biaoti.biaoti-line").text();
            String content = page.select(".wp_articlecontent p").text();
            sb.append("URL:\t").append(url).append("\n")
              .append("title:\t").append(title).append("\n")
              .append("content:\t").append(content).append("\n\n");
        }
        try {
            // rewrite the output file with the full buffer after every page
            writeFile(fileName, sb.toString(), code);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public static void writeFile(String file, String content, String code) throws IOException {
        // overwrite the target file with the given content, using the configured encoding;
        // try-with-resources closes the streams even if writing fails
        try (FileOutputStream out = new FileOutputStream(new File(file), false);
             BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out, code))) {
            bw.write(content);
        }
    }
}
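Because visit() is invoked by several worker threads and the crawler above rewrites the whole accumulated buffer on every page, one alternative is to append a single record per page instead. The sketch below uses only JDK I/O; the class and method names (RecordWriter, appendRecord) are hypothetical and not part of WebCollector or the crawler above.

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;

public class RecordWriter {
    // Append a single crawled record to the output file; CREATE makes the file
    // on first use, APPEND avoids rewriting previously written records.
    public static synchronized void appendRecord(String file, String record, String encoding)
            throws IOException {
        Files.write(Paths.get(file),
                record.getBytes(Charset.forName(encoding)),
                StandardOpenOption.CREATE, StandardOpenOption.APPEND);
    }
}

With this variant, visit() would call RecordWriter.appendRecord(fileName, record, code) with just the current page's record instead of calling writeFile with the full buffer.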