- package com.sg123.nutch.plugin.parse.html;
- import java.util.Enumeration;
- import java.util.Properties;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.nutch.parse.HTMLMetaTags;
- import org.apache.nutch.parse.HtmlParseFilter;
- import org.apache.nutch.parse.ParseResult;
- import org.apache.nutch.protocol.Content;
- import org.w3c.dom.DocumentFragment;
- /**
-  * HTML parse filter that extracts the {@code content} value of the meta tag
-  * named "description" and stores it in the parse metadata of the current
-  * page under the key {@code "description"}.
-  *
-  * <p>Progress messages are written to standard output.
-  *
-  * @author lvshow
-  */
- public class Description implements HtmlParseFilter { // implements the HtmlParseFilter interface
-     /** Meta tag name that is searched for, and the parse-metadata key that is written. */
-     private static final String DESCRIPTION = "description";
-     /** Configuration received through {@link #setConf(Configuration)}; {@code null} until it is set. */
-     private Configuration conf;
-     /**
-      * Copies the description meta tag of the page, if present, into the parse metadata.
-      *
-      * @param content     fetched content; its URL selects the entry of {@code parseResult} to update
-      * @param parseResult parse result whose metadata receives the description
-      * @param metaTags    meta tags of the page; the general tags are searched for "description"
-      * @param doc         parsed document fragment (not used by this filter)
-      * @return the same {@code parseResult} instance that was passed in
-      */
-     @Override
-     public ParseResult filter(Content content, ParseResult parseResult,
-             HTMLMetaTags metaTags, DocumentFragment doc) {
-         String description = null;
-         Properties generalMetaTags = metaTags.getGeneralTags();
-         for (Enumeration<?> tagNames = generalMetaTags.propertyNames(); tagNames.hasMoreElements();) {
-             String tagName = tagNames.nextElement().toString();
-             if (!tagName.equalsIgnoreCase(DESCRIPTION)) {
-                 continue;
-             }
-             // Keep the original lookup order first ("description", then "Description") ...
-             description = generalMetaTags.getProperty(DESCRIPTION);
-             if (description == null) {
-                 description = generalMetaTags.getProperty("Description");
-             }
-             // ... then fall back to the key that actually matched, so that any other
-             // spelling (e.g. "DESCRIPTION") is no longer matched and then lost.
-             if (description == null) {
-                 description = generalMetaTags.getProperty(tagName);
-             }
-             System.out.println("找到了 " + description);
-         }
-         if (description == null) {
-             System.out.println("没有description标签");
-         } else {
-             System.out.println("添加description! " + description);
-             // The map inside parseResult is keyed by URL; content.getUrl() yields the current URL.
-             parseResult.get(content.getUrl()).getData().getParseMeta().set(
-                     DESCRIPTION, description);
-         }
-         return parseResult;
-     }
-     /**
-      * Returns the configuration previously supplied through {@link #setConf(Configuration)}.
-      *
-      * @return the stored configuration, or {@code null} if none has been set yet
-      */
-     @Override
-     public Configuration getConf() {
-         return conf;
-     }
-     /**
-      * Stores the configuration so that {@link #getConf()} can return it.
-      *
-      * @param conf configuration to remember
-      */
-     @Override
-     public void setConf(Configuration conf) {
-         this.conf = conf;
-     }
- }
以上为插件类
plugin.xml如下:
- <?xml version="1.0" encoding="UTF-8"?>
- <plugin
- id="description"
- name="description Parser/Filter"
- version="1.0.0"
- provider-name="nutch.org"> <!-- Plugin descriptor for the "description" parse filter plugin. -->
- <runtime> <!-- Libraries shipped with the plugin. -->
- <library name="description.jar"> <!-- Jar expected to contain the compiled plugin class. -->
- <export name="*"/> <!-- Export everything contained in the library. -->
- </library>
- </runtime>
- <extension id="com.sg123.nutch.plugin.parse.html.descriptionfilter"
- name="description Parser"
- point="org.apache.nutch.parse.HtmlParseFilter"> <!-- Attaches to the HtmlParseFilter extension point. -->
- <implementation id="Description"
- class="com.sg123.nutch.plugin.parse.html.Description"> <!-- Fully qualified name of the filter class. -->
- </implementation>
- </extension>
- </plugin>
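把插件类编译并打包为 description.jar(与 plugin.xml 中 library 的 name 一致),
然后将 description.jar 和 plugin.xml 一起放入 plugins 目录下的 description 子目录,并在 nutch-site.xml 的 plugin.includes 属性中加入 description,插件才会被加载。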