Robots.txt是一种用于限制网络爬虫的文件:在构建网站时,可以在站点根目录下放置一个Robots.txt文件,在其中声明不希望搜索引擎访问的部分。然而,这也导致Heritrix爬虫在抓取网页时花费过多的时间去判断该Robots.txt文件是否存在……好在这个协议本身只是一种建议性的附加协议,完全可以不遵守。
Heritrix在org.archive.crawler.prefetch.PreconditionEnforcer类中定义了判断是否需要先获取Robots.txt的方法。我的做法是:无论Robots.txt是否存在,都视为不存在(即无需满足该前置条件),修改后的方法如下:
/**
 * Decides whether a robots.txt precondition must be satisfied before the
 * given URI may be fetched.
 *
 * <p>NOTE: robots.txt handling is deliberately disabled here to improve
 * crawl throughput. The robots exclusion protocol is advisory only, and
 * checking (and periodically refetching) robots.txt for every host added
 * significant per-URI overhead. The original Heritrix logic — treating
 * /robots.txt fetches specially, deferring URIs while robots data was
 * expired, and honoring disallow rules via
 * {@code CrawlServer.getRobots().disallows(...)} — has been removed;
 * consult version control history if that behavior must be restored.
 *
 * @param curi the URI being considered for fetching
 * @return {@code false} always, meaning no robots.txt precondition
 *         applies and processing may continue unconditionally
 */
private boolean considerRobotsPreconditions(CrawlURI curi) {
    // Skip all robots.txt checks: never defer, never preclude any URI.
    return false;
}