1. Crawler是什么?
crawler4j是一个开源的java爬虫类库,可以用来构建多线程的web爬虫来抓取页面内容。
2. 如何获取Crawler?
crawler4j的官方地址在这里,目前版本为4.1。如果你使用Maven,可以通过下面的pom的方式引入;如需直接下载,点击这里。
3. Crawler怎么用?
crawler4j的使用分为两个步骤:一是实现一个继承自edu.uci.ics.crawler4j.crawler.WebCrawler的爬虫类;另外就是通过CrawlController调用实现的爬虫类。
package com.favccxx.favsoft.favcrawler;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
/**
 * A crawler4j {@link WebCrawler} that restricts crawling to
 * http://www.oschina.net/ and skips common static/binary resources.
 * For each visited page it logs URL metadata and, for HTML pages,
 * basic parse statistics (text/html length, outgoing link count).
 */
public class FavWebCrawler extends WebCrawler {

    // Bug fix: the logger must be named after THIS class, not the parent
    // WebCrawler, otherwise log output is attributed to the wrong category.
    private static final Logger logger = LoggerFactory.getLogger(FavWebCrawler.class);

    // URLs ending in these extensions are skipped.
    // Bug fix: the original pattern listed "mp3" twice.
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    /**
     * Decides whether a discovered URL should be fetched.
     *
     * @param referringPage the page on which the URL was found (unused here)
     * @param url           the candidate URL
     * @return true only for non-filtered URLs under http://www.oschina.net/
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith("http://www.oschina.net/");
    }

    /**
     * Called when a fetched page is ready to be processed.
     * Logs the page's URL metadata and HTML parse statistics.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        // Fetch the WebURL once instead of repeating the getter chain.
        WebURL webUrl = page.getWebURL();
        logger.debug("Docid: {}", webUrl.getDocid());
        logger.info("URL: {}", webUrl.getURL());
        logger.debug("Domain: '{}'", webUrl.getDomain());
        logger.debug("Sub-domain: '{}'", webUrl.getSubDomain());
        logger.debug("Path: '{}'", webUrl.getPath());
        logger.debug("Parent page: {}", webUrl.getParentUrl());
        logger.debug("Anchor text: {}", webUrl.getAnchor());

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            // Fixed: use parameterized logging instead of string concatenation,
            // so the message is only built when DEBUG is enabled.
            logger.debug("Text length: {}", text.length());
            logger.debug("Html length: {}", html.length());
            logger.debug("Number of outgoing links: {}", links.size());
        }
    }
}
package com.favccxx.favsoft.favcrawler;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;
/**
 * Demo crawler4j {@link WebCrawler} limited to http://www.oschina.net/.
 * The {@code main} method wires up a {@link CrawlController} with a local
 * storage folder and starts a blocking multi-threaded crawl.
 */
public class MyCrawler extends WebCrawler {

    // Bug fix: logger was obtained for WebCrawler.class; it must be named
    // after this class so log output is attributed correctly.
    private static final Logger logger = LoggerFactory.getLogger(MyCrawler.class);

    // URLs ending in these extensions are skipped.
    // Bug fix: the original pattern listed "mp3" twice.
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpg|png|mp3|zip|gz))$");

    /**
     * Decides whether a discovered URL should be fetched.
     *
     * @param referringPage the page on which the URL was found (unused here)
     * @param url           the candidate URL
     * @return true only for non-filtered URLs under http://www.oschina.net/
     */
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("http://www.oschina.net/");
    }

    /**
     * This function is called when a page is fetched and ready
     * to be processed by your program.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        // Cleanup: removed commented-out code and the System.out.println
        // duplicates of the same data; all output now goes through SLF4J.
        WebURL webUrl = page.getWebURL();
        logger.debug("Docid: {}", webUrl.getDocid());
        logger.info("URL: {}", webUrl.getURL());
        logger.debug("Domain: '{}'", webUrl.getDomain());
        logger.debug("Sub-domain: '{}'", webUrl.getSubDomain());
        logger.debug("Path: '{}'", webUrl.getPath());
        logger.debug("Parent page: {}", webUrl.getParentUrl());
        logger.debug("Anchor text: {}", webUrl.getAnchor());

        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            String html = htmlParseData.getHtml();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            // Parameterized logging: message built only when DEBUG is on.
            logger.debug("Text length: {}", text.length());
            logger.debug("Html length: {}", html.length());
            logger.debug("Number of outgoing links: {}", links.size());
        }
    }

    /**
     * Entry point: configures and starts a blocking crawl of
     * http://www.oschina.net/ with 7 worker threads.
     *
     * @param args unused
     * @throws Exception if the crawl controller fails to initialize or run
     */
    public static void main(String[] args) throws Exception {
        String crawlStorageFolder = "/data/crawl/root";
        int numberOfCrawlers = 7;

        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder(crawlStorageFolder);

        /*
         * Instantiate the controller for this crawl.
         */
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        /*
         * For each crawl, you need to add some seed urls. These are the first
         * URLs that are fetched and then the crawler starts following links
         * which are found in these pages.
         */
        controller.addSeed("http://www.oschina.net/");

        /*
         * Start the crawl. This is a blocking operation, meaning that your code
         * will reach the line after this only when crawling is finished.
         */
        controller.start(MyCrawler.class, numberOfCrawlers);
    }
}
4. Crawler常用配置
crawler4j的配置文件都位于edu.uci.ics.crawler4j.crawler.CrawlConfig中,各配置属性的详细说明如下。
crawlStorageFolder:临时存储抓取来的文件的地方,相当于文件中转站。 resumableCrawling:是否重新抓取上一个异常停止/损坏的文件的开关,默认不开启。如果开启该开关,毫无疑问会降低抓取的效率。 maxDepthOfCrawling:抓取的最大深度。默认为-1,即无限深度。 maxPagesToFetch:抓取的最大页面数。默认为-1,即无限抓取。 userAgentString:抓取web服务器的用户代理。默认为“crawler4j (http://code.google.com/p/crawler4j/)”。 politenessDelay:(同一主机的两个请求间的)延迟毫秒数。默认为200。 includeHttpsPages:是否包含Https页面。默认包含。 includeBinaryContentInCrawling:是否包含二进制文件,如image,audio等。默认为不抓取。 maxConnectionsPerHost:每个主机的最大连接数,默认为100。 maxTotalConnections:主机的总共连接数,默认为100。 socketTimeout:socket超时毫秒数,默认为20000。 connectionTimeout:连接超时毫秒数,默认为30000。 maxOutgoingLinksToFollow:每个页面的最大外链数,默认为5000。 maxDownloadSize:每个页面的最大下载容量,默认为1048576字节(即1MB),超过的部分不会下载。 followRedirects:是否抓取重定向的页面,默认抓取。 proxyHost:代理主机地址,仅在使用代理上网时使用。 proxyPort:代理端口号。 proxyUsername:代理用户名。 proxyPassword:代理密码。 authInfos:授权用户信息。 |