package org.apache.nutch.htmlfilter.my;

import java.util.regex.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.Crawl;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.w3c.dom.DocumentFragment;

/**
 * HTML parse filter that pulls a few fields (title, article body, publication
 * date) out of the raw page markup with regular expressions and stores them,
 * together with two fixed labels, in the parse metadata under {@code p_*} keys.
 *
 * <p>Extraction is best-effort: a field whose pattern does not match is left
 * out of the metadata, and an unexpected failure is logged without
 * interrupting the surrounding parse.
 */
public class MyHtmlParseFilter implements HtmlParseFilter {

    public static final Log LOG = LogFactory.getLog(MyHtmlParseFilter.class);

    // Compiled once and shared; Pattern instances are immutable.
    private static final Pattern TITLE_PATTERN = Pattern
            .compile("<span .+class=\"b14c\">(.*?)</span>");
    private static final Pattern ARTICLE_PATTERN = Pattern
            .compile("<td .*class=\"h14\".*>([\\s\\S]+?)</td>");
    private static final Pattern PUBDATE_PATTERN = Pattern
            .compile("<font class=\"h12\">发布时间:(.*)</font>");

    /** Fixed value stored under {@code p_site} for every page. */
    private static final String SITE_LABEL = "中国公路信息网|行业动态|新通车信息";

    /** Fixed value stored under {@code p_cate} for every page. */
    private static final String CATEGORY = "1234567";

    private Configuration conf;

    /**
     * Extracts the configured fields from the page content and adds them to
     * the metadata of the parse registered for the content's URL.
     *
     * @param content     fetched page; its raw bytes are scanned by the patterns
     * @param parseResult parse result holding the parse for {@code content}'s URL
     * @param metaTags    not used by this filter
     * @param doc         not used by this filter
     * @return the same {@code parseResult} instance that was passed in
     */
    public ParseResult filter(Content content, ParseResult parseResult,
            HTMLMetaTags metaTags, DocumentFragment doc) {
        Parse parse = parseResult.get(content.getUrl());
        if (parse == null) {
            // Nothing to annotate; previously this dereferenced null.
            LOG.warn("No parse available for " + content.getUrl()
                    + "; skipping field extraction");
            return parseResult;
        }
        Metadata md = parse.getData().getParseMeta();
        try {
            byte[] raw = content.getContent();
            if (raw == null) {
                return parseResult;
            }
            // NOTE(review): decodes with the platform default charset, as the
            // original did. The page's real encoding is not visible here;
            // confirm and pass an explicit charset if the two can differ.
            String html = new String(raw);

            // Extract fields (sample of body-text extraction).
            String title = extract(html, TITLE_PATTERN);
            String article = extract(html, ARTICLE_PATTERN);
            String pubdate = normalizePubdate(extract(html, PUBDATE_PATTERN));
            // No referring URL is derived yet; kept as a placeholder field.
            String refurl = null;

            addIfPresent(md, "p_title", title);
            addIfPresent(md, "p_article", article);
            addIfPresent(md, "p_site", SITE_LABEL);
            addIfPresent(md, "p_pubdate", pubdate);
            addIfPresent(md, "p_refurl", refurl);
            addIfPresent(md, "p_cate", CATEGORY);
        } catch (Exception e) {
            // Boundary catch: a plugin failure must not abort the parse. Log
            // with the throwable so the stack trace is preserved.
            LOG.warn("Field extraction failed for " + content.getUrl(), e);
        }
        return parseResult;
    }

    /**
     * Rewrites a date such as {@code 2017年11月05日} as {@code 2017-11-05} by
     * turning the year/month markers into dashes and dropping the day marker.
     *
     * @param raw extracted date text, possibly {@code null}
     * @return the rewritten text, or {@code null} when {@code raw} is {@code null}
     */
    private static String normalizePubdate(String raw) {
        if (raw == null) {
            return null;
        }
        return raw.replace('年', '-').replace('月', '-').replace("日", "");
    }

    /** Adds {@code value} under {@code key} unless the value is {@code null}. */
    private static void addIfPresent(Metadata md, String key, String value) {
        if (value != null) {
            md.add(key, value);
        }
    }

    /**
     * Returns the trimmed first capture group of the LAST match of {@code p}
     * in {@code html}, or {@code null} when the pattern never matches.
     */
    private String extract(String html, Pattern p) {
        Matcher match = p.matcher(html);
        String val = null;
        while (match.find()) {
            val = match.group(1);
            if (val != null) {
                val = val.trim();
            }
        }
        return val;
    }

    /** Returns the configuration previously supplied via {@link #setConf}. */
    public Configuration getConf() {
        return this.conf;
    }

    /** Stores the configuration for later retrieval via {@link #getConf}. */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }
}
Nutch 从网页中提取字段并索引（HtmlParseFilter）
最新推荐文章于 2017-11-05 10:20:46 发布