package com.sg123.nutch.plugin.parse.html;
import java.util.Enumeration;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.DocumentFragment;
/**
 * Nutch {@link HtmlParseFilter} that extracts the {@code content} attribute
 * value of the {@code <meta name="description">} tag and stores it in the
 * parse metadata under the key {@code "description"}.
 *
 * @author lvshow
 */
public class Description implements HtmlParseFilter {

    /** Meta tag name to look for; also the key written to the parse metadata. */
    private static final String DESCRIPTION = "description";

    /** Configuration supplied through {@link #setConf(Configuration)}; {@code null} until set. */
    private Configuration conf;

    /**
     * Copies the page's description meta tag, if any, into the parse metadata
     * of the entry for the current URL.
     *
     * @param content     fetched content; only its URL is read here
     * @param parseResult parse result whose entry for {@code content.getUrl()} is updated
     * @param metaTags    meta tags of the page; the general tags are searched
     * @param doc         DOM of the page (not used by this filter)
     * @return the same {@code parseResult} instance that was passed in
     */
    @Override
    public ParseResult filter(Content content, ParseResult parseResult,
            HTMLMetaTags metaTags, DocumentFragment doc) {
        String description = findDescription(metaTags.getGeneralTags());

        if (description == null) {
            System.out.println("没有description标签");
            return parseResult;
        }

        System.out.println("添加description! " + description);
        // The ParseResult map is keyed by URL; content.getUrl() yields the current one.
        String url = content.getUrl();
        // Guard against a missing entry for this URL instead of failing with a
        // NullPointerException; there is nothing to annotate in that case.
        if (parseResult.get(url) != null) {
            parseResult.get(url).getData().getParseMeta().set(DESCRIPTION, description);
        }
        return parseResult;
    }

    /**
     * Looks up the description meta tag value, ignoring the case of the tag name.
     * An exact lower-case {@code "description"} key takes precedence; otherwise
     * the first key that matches case-insensitively (e.g. {@code "DESCRIPTION"})
     * is used, and its value is read under that actual key.
     *
     * @param generalMetaTags general meta tags of the page
     * @return the description value, or {@code null} when no such tag exists
     */
    private static String findDescription(Properties generalMetaTags) {
        String description = generalMetaTags.getProperty(DESCRIPTION);
        if (description == null) {
            for (String tagName : generalMetaTags.stringPropertyNames()) {
                if (tagName.equalsIgnoreCase(DESCRIPTION)) {
                    // Read the value under the key as it is actually stored.
                    description = generalMetaTags.getProperty(tagName);
                    break;
                }
            }
        }
        if (description != null) {
            System.out.println("找到了 " + description);
        }
        return description;
    }

    /**
     * Returns the configuration previously passed to {@link #setConf(Configuration)}.
     *
     * @return the stored configuration, or {@code null} if none has been set
     */
    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration so that {@link #getConf()} can return it.
     *
     * @param conf configuration to remember
     */
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
以上为插件类
plugin.xml如下:
<?xml version="1.0" encoding="UTF-8"?>
<!--
Plugin descriptor for the "description" plugin. It declares one library,
description.jar, and one extension that plugs the Description class into
the org.apache.nutch.parse.HtmlParseFilter extension point.
NOTE(review): no <requires> element is declared here; confirm whether the
target Nutch version needs an import of nutch-extensionpoints.
-->
<plugin
id="description"
name="description Parser/Filter"
version="1.0.0"
provider-name="nutch.org">
<!-- Jar that holds the plugin classes; every package in it is exported. -->
<runtime>
<library name="description.jar">
<export name="*"/>
</library>
</runtime>
<!-- Binds the implementation class to the HtmlParseFilter extension point. -->
<extension id="com.sg123.nutch.plugin.parse.html.descriptionfilter"
name="description Parser"
point="org.apache.nutch.parse.HtmlParseFilter">
<implementation id="Description"
class="com.sg123.nutch.plugin.parse.html.Description">
</implementation>
</extension>
</plugin>
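把插件类编译并打包为description.jar,
然后连同plugin.xml一起放入plugins目录下的子目录(例如plugins/description),并在nutch-site.xml的plugin.includes属性中加入description,插件才会被加载。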