/**
 * Indexing filter that rejects a page when its parsed text or its title
 * contains any of the words supplied by {@code FilterWord.getFilterWords()}.
 */
public class ContentFilter implements IndexingFilter {

    private Configuration conf;

    public ContentFilter() {
    }

    /**
     * Filters out pages that contain a filter word.
     *
     * @param doc     the document being indexed; returned unchanged when no
     *                filter word matches
     * @param parse   parse result providing the page text and title
     * @param url     page URL (not used by this filter)
     * @param datum   crawl datum (not used by this filter)
     * @param inlinks inlinks of the page (not used by this filter)
     * @return {@code doc} when neither the text nor the title contains a filter
     *         word; {@code null} when at least one filter word is found
     * @throws IndexingException declared by the interface; not thrown here
     */
    public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
            CrawlDatum datum, Inlinks inlinks) throws IndexingException {
        List<String> filterWords = FilterWord.getFilterWords();
        // Nothing to reject (or nothing to reject with): pass the document through.
        if (doc == null || filterWords == null || filterWords.isEmpty()) {
            return doc;
        }

        String content = parse.getText();            // page body text
        String title = parse.getData().getTitle();   // page title

        if (containsAny(content, filterWords) || containsAny(title, filterWords)) {
            return null;
        }
        return doc;
    }

    /**
     * Returns {@code true} when {@code text} contains at least one usable word
     * from {@code words}.
     *
     * <p>Null or empty text never matches. Null and empty words are skipped:
     * {@code String.contains("")} is always true, so a stray empty entry would
     * otherwise reject every page.
     */
    private static boolean containsAny(String text, List<String> words) {
        if (text == null || text.isEmpty()) {
            return false;
        }
        for (String word : words) {
            if (word != null && !word.isEmpty() && text.contains(word)) {
                return true;
            }
        }
        return false;
    }

    /** Stores the configuration handed to this filter. */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /** Returns the configuration most recently stored on this filter. */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Stores the given configuration on this filter.
     *
     * <p>NOTE(review): this replaces the stored configuration rather than adding
     * options to the passed-in one; behaviour is kept as-is for compatibility —
     * confirm against the interface contract whether that is intended.
     */
    public void addIndexBackendOptions(Configuration conf) {
        this.conf = conf;
    }
}
Nutch内容过滤的实现[转]
最新推荐文章于 2013-09-18 14:10:00 发布