1. Crawler是什么?
crawler4j是一个开源的java爬虫类库,可以用来构建多线程的web爬虫来抓取页面内容。
2. 如何获取Crawler?
crawler4j的官方地址在这里,目前版本为4.1。如果你使用Maven,可以通过下面的pom的方式引入;如需直接下载,点击这里。
3. Crawler怎么用?
crawler4j的使用分为两个步骤:一是实现一个继承自edu.uci.ics.crawler4j.crawler.WebCrawler的爬虫类;另外就是通过CrawlController调用实现的爬虫类。
package com.favccxx.favsoft.favcrawler;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
/**
 * A crawler4j {@link WebCrawler} that restricts crawling to
 * http://www.oschina.net/ and skips common static/binary resources.
 * For each visited page it logs URL metadata and, for HTML pages,
 * basic parse statistics (text/html length, outgoing link count).
 */
public class FavWebCrawler extends WebCrawler {

    // Bug fix: the logger must be named after THIS class, not the parent
    // WebCrawler, otherwise log output is attributed to the wrong category.
    private static final Logger logger = LoggerFactory.getLogger(FavWebCrawler.class);

    // URLs ending in these extensions are skipped.
    // Bug fix: the original pattern listed "mp3" twice.
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    /**
     * Decides whether a discovered URL should be fetched.
     *
     * @param referringPage the page on which the URL was found (unused here)
     * @param url           the candidate URL
     * @return true only for non-filtered URLs under http://www.oschina.net/
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith("http://www.oschina.net/");
    }

    /**
     * Called when a fetched page is ready to be processed.
     * Logs the page's URL metadata and HTML parse statistics.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        // Fetch the WebURL once instead of repeating the getter chain.
        WebURL webUrl = page.getWebURL();
        logger.debug("Docid: {}", webUrl.getDocid());
        logger.info("URL: {}", webUrl.getURL());
        logger.debug("Domain: '{}'", webUrl.getDomain());
        logger.debug("Sub-domain: '{}'", webUrl.getSubDomain());
        logger.debug("Path: '{}'", webUrl.getPath());
        logger.debug("Parent page: {}", webUrl.getParentUrl());
        logger.debug("Anchor text: {}", webUrl.getAnchor());

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            // Fixed: use parameterized logging instead of string concatenation,
            // so the message is only built when DEBUG is enabled.
            logger.debug("Text length: {}", text.length());
            logger.debug("Html length: {}", html.length());
            logger.debug("Number of outgoing links: {}", links.size());
        }
    }
}
package com.favccxx.favsoft.favcrawler;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;
/**
 * Demo crawler4j {@link WebCrawler} limited to http://www.oschina.net/.
 * The {@code main} method wires up a {@link CrawlController} with a local
 * storage folder and starts a blocking multi-threaded crawl.
 */
public class MyCrawler extends WebCrawler {

    // Bug fix: logger was obtained for WebCrawler.class; it must be named
    // after this class so log output is attributed correctly.
    private static final Logger logger = LoggerFactory.getLogger(MyCrawler.class);

    // URLs ending in these extensions are skipped.
    // Bug fix: the original pattern listed "mp3" twice.
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    /**
     * Decides whether a discovered URL should be fetched.
     *
     * @param referringPage the page on which the URL was found (unused here)
     * @param url           the candidate URL
     * @return true only for non-filtered URLs under http://www.oschina.net/
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.oschina.net/");
    }

    /**
     * This function is called when a page is fetched and ready
     * to be processed by your program.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        // Cleanup: removed commented-out code and the System.out.println
        // duplicates of the same data; all output now goes through SLF4J.
        WebURL webUrl = page.getWebURL();
        logger.debug("Docid: {}", webUrl.getDocid());
        logger.info("URL: {}", webUrl.getURL());
        logger.debug("Domain: '{}'", webUrl.getDomain());
        logger.debug("Sub-domain: '{}'", webUrl.getSubDomain());
        logger.debug("Path: '{}'", webUrl.getPath());
        logger.debug("Parent page: {}", webUrl.getParentUrl());
        logger.debug("Anchor text: {}", webUrl.getAnchor());

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            // Parameterized logging: message built only when DEBUG is on.
            logger.debug("Text length: {}", text.length());
            logger.debug("Html length: {}", html.length());
            logger.debug("Number of outgoing links: {}", links.size());
        }
    }

    /**
     * Entry point: configures and starts a blocking crawl of
     * http://www.oschina.net/ with 7 worker threads.
     *
     * @param args unused
     * @throws Exception if the crawl controller fails to initialize or run
     */
    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root";
        int numberOfCrawlers = 7;

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * For each crawl, you need to add some seed urls. These are the first
         * URLs that are fetched and then the crawler starts following links
         * which are found in these pages.
         */
        controller.addSeed("http://www.oschina.net/");

        /*
         * Start the crawl. This is a blocking operation, meaning that your code
         * will reach the line after this only when crawling is finished.
         */
        controller.start(MyCrawler.class, numberOfCrawlers);
    }
}
4. Crawler常用配置
crawler4j的配置文件都位于edu.uci.ics.crawler4j.crawler.CrawlConfig中,各配置属性的详细说明如下。
crawlStorageFolder:临时存储抓取来的文件的地方,相当于文件中转站。 resumableCrawling:是否重新抓取上一个异常停止/损坏的文件的开关,默认不开启。如果开启该开关,毫无疑问会降低抓取的效率。 maxDepthOfCrawling:抓取的最大深度。默认为-1,即无限深度。 maxPagesToFetch:抓取的最大页面数。默认为-1,即无限抓取。 userAgentString:抓取web服务器的用户代理。默认为“crawler4j (http://code.google.com/p/crawler4j/)”。 politenessDelay:(同一主机的两个请求间的)延迟毫秒数。默认为200。 includeHttpsPages:是否包含Https页面。默认包含。 includeBinaryContentInCrawling:是否包含二进制文件,如image,audio等。默认为不抓取。 maxConnectionsPerHost:每个主机的最大连接数,默认为100。 maxTotalConnections:主机的总共连接数,默认为100。 socketTimeout:socket超时毫秒数,默认为20000。 connectionTimeout:连接超时毫秒数,默认为30000。 maxOutgoingLinksToFollow:每个页面的最大外链数,默认为5000。 maxDownloadSize:每个页面的最大下载容量,默认为1048576字节(即1MB),超过的部分不会下载。 followRedirects:是否抓取重定向的页面,默认抓取。 proxyHost:代理主机地址,仅在使用代理上网时使用。 proxyPort:代理端口号。 proxyUsername:代理用户名。 proxyPassword:代理密码。 authInfos:授权用户信息。 |