1. Importing WebCollector into your project:
Go to the WebCollector homepage: https://github.com/CrawlScript/WebCollector
Download: webcollector-<version>-bin.zip
Add all the jar files from the unzipped folder to your project.
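Note that the demos in the sections below omit their import statements. The sketch below lists the imports they would roughly need; the WebCollector package paths shown are assumptions and may differ between versions, so verify them against the jars you downloaded.

    // Rough sketch of the imports used by the demos below.
    // The WebCollector package paths are assumptions and may vary by version;
    // check the classes inside the downloaded jars to confirm them.
    import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
    import cn.edu.hfut.dmic.webcollector.model.Page;
    import cn.edu.hfut.dmic.webcollector.util.Config;
    import java.io.IOException;
    import java.util.regex.Pattern;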
2. Crawling an entire site with WebCollector:
Crawl all the content of the Xinhuanet site:

    public class Demo {
        public static void main(String[] args) throws IOException {
            BreadthCrawler crawler = new BreadthCrawler();
            crawler.addSeed("http://www.xinhuanet.com/");
            crawler.addRegex("http://www.xinhuanet.com/.*");
            /* pages, images, and files are stored in the "download" folder */
            crawler.setRoot("download");
            /* crawl to a depth of 5 */
            crawler.start(5);
        }
    }
3. Precise extraction with WebCollector:
A crawler (Java) that crawls Zhihu and precisely extracts its questions:

    public class ZhihuCrawler extends BreadthCrawler {
        /* the visit method defines what to do each time a page is visited */
        @Override
        public void visit(Page page) {
            String question_regex = "^http://www.zhihu.com/question/[0-9]+";
            if (Pattern.matches(question_regex, page.getUrl())) {
                System.out.println("Extracting " + page.getUrl());
                /* extract the title */
                String title = page.getDoc().title();
                System.out.println(title);
                /* extract the question body */
                String question = page.getDoc().select("div[id=zh-question-detail]").text();
                System.out.println(question);
            }
        }

        /* start the crawler */
        public static void main(String[] args) throws IOException {
            ZhihuCrawler crawler = new ZhihuCrawler();
            crawler.addSeed("http://www.zhihu.com/question/21003086");
            crawler.addRegex("http://www.zhihu.com/.*");
            crawler.start(5);
        }
    }
4. Crawling a specified list of URLs with WebCollector (no recursive crawling needed):

    public class Demo2 {
        public static void main(String[] args) throws IOException {
            /* topN is the number of URLs each page may contribute during
               recursive crawling; set it to 0 since no recursion is needed here */
            Config.topN = 0;
            BreadthCrawler crawler = new BreadthCrawler();
            crawler.addSeed("http://www.xinhuanet.com/");
            crawler.addSeed("http://www.sina.com.cn/");
            crawler.addRegex(".*");
            /* pages, images, and files are stored in the "download" folder */
            crawler.setRoot("download");
            /* crawl to a depth of 1 */
            crawler.start(1);
        }
    }
5. Crawling both on-site and off-site content with WebCollector:
Crawl Xinhuanet together with the content of all external links found on it, the external links of those pages, and so on:

    public class Demo3 {
        public static void main(String[] args) throws IOException {
            BreadthCrawler crawler = new BreadthCrawler();
            crawler.addSeed("http://www.xinhuanet.com/");
            /* pages, images, and files are stored in the "download" folder */
            crawler.setRoot("download");
            /* restrict which URLs may be crawled (URL regex) */
            crawler.addRegex(".*");
            /* crawl to a depth of 5 */
            crawler.start(5);
        }
    }
6. Advanced configuration:

    public class Demo4 {
        public static void main(String[] args) throws IOException {
            BreadthCrawler crawler = new BreadthCrawler();
            crawler.addSeed("http://www.xinhuanet.com/");
            /* path where URL (crawl) information is stored */
            crawler.setCrawl_path("crawl");
            /* pages, images, and files are stored in the "download" folder */
            crawler.setRoot("download");
            /* positive rules: a URL must match at least one positive rule to be crawled */
            crawler.addRegex("+^http://www.xinhuanet.com/");
            crawler.addRegex("+^http://news.xinhuanet.com.*");
            /* negative rules: a URL matching any negative rule is skipped */
            crawler.addRegex("-^http://news.xinhuanet.com/edu.*");
            /* number of crawler threads */
            crawler.setThreads(30);
            /* set the User-Agent */
            crawler.setUseragent("Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:26.0) Gecko/20100101 Firefox/26.0");
            /* set the cookie */
            crawler.setCookie("your cookie string");
            /* enable or disable resumable (checkpoint) crawling */
            crawler.setResumable(false);
            /* crawl to a depth of 5 */
            crawler.start(5);
        }
    }