//==================================Metadata===========================
package org.apache.nutch.metadata;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
 * A multi-valued metadata container: wraps a single Map whose values are
 * String arrays, so one attribute name can hold several values.
 * NOTE(review): excerpt — the Writable write/readFields implementations and
 * the accessor methods are not shown in this fragment.
 */
public class Metadata implements Writable, CreativeCommons,
DublinCore, HttpHeaders, Nutch, Office, Feed {
/**
 * A map of all metadata attributes.
 * Each key maps to a String[] because an attribute may carry multiple values.
 */
private Map<String, String[]> metadata = null;
/**
 * Constructs a new, empty metadata.
 */
public Metadata() {
metadata = new HashMap<String, String[]>();
}
}
//==========================ParseData=======================================
package org.apache.nutch.parse;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
/**
 * Data extracted from a page's content.
 * Holds the outcome of parsing a fetched page: the parse status, the page
 * title, its outlinks, and two metadata sets — one captured at fetch/protocol
 * time ({@code contentMeta}) and one produced by the parser ({@code parseMeta}).
 * @see Parse#getData()
 */
public final class ParseData extends VersionedWritable {
  public static final String DIR_NAME = "parse_data";

  private final static byte VERSION = 5;

  // Fields extracted from the page during parsing.
  private String title;
  private Outlink[] outlinks;
  private Metadata contentMeta;   // metadata captured at fetch/protocol time
  private Metadata parseMeta;     // metadata produced by the parser itself
  private ParseStatus status;
  private byte version = VERSION;

  /** Creates an empty ParseData with fresh, empty metadata containers. */
  public ParseData() {
    this.contentMeta = new Metadata();
    this.parseMeta = new Metadata();
  }

  /** Convenience constructor: parse metadata defaults to an empty container. */
  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
                   Metadata contentMeta) {
    this(status, title, outlinks, contentMeta, new Metadata());
  }

  /** Full constructor taking both content-level and parse-level metadata. */
  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
                   Metadata contentMeta, Metadata parseMeta) {
    this.title = title;
    this.status = status;
    this.outlinks = outlinks;
    this.contentMeta = contentMeta;
    this.parseMeta = parseMeta;
  }

  /** Returns the metadata produced by the parser. */
  public Metadata getParseMeta() {
    return this.parseMeta;
  }

  /** Replaces the parser-produced metadata. */
  public void setParseMeta(Metadata parseMeta) {
    this.parseMeta = parseMeta;
  }
}
//==============================Content========================================
package org.apache.nutch.protocol;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
/**
 * Raw content fetched for a single URL, together with its base URL,
 * resolved media type, and protocol-level metadata.
 */
public final class Content implements Writable {
  public static final String DIR_NAME = "content";

  private int version;
  private String url;           // URL the content was fetched from
  private String base;          // base URL for resolving relative links
  private byte[] content;       // raw binary page content (markup included)
  private String contentType;   // resolved media type
  private Metadata metadata;    // protocol-level metadata
  private MimeUtil mimeTypes;   // resolver used to detect the media type

  /** Creates an empty Content with a fresh, empty metadata container. */
  public Content() {
    this.metadata = new Metadata();
  }

  /**
   * Full constructor; resolves the effective content type from the declared
   * type, the URL, and the raw bytes.
   */
  public Content(String url, String base, byte[] content, String contentType,
                 Metadata metadata, Configuration conf) {
    this.url = url;
    this.base = base;
    this.content = content;
    this.metadata = metadata;
    // mimeTypes must be initialized before resolving the content type below.
    this.mimeTypes = new MimeUtil(conf);
    this.contentType = getContentType(contentType, url, content);
  }

  /**
   * The media type of the retrieved content.
   * @see <a href="http://www.iana.org/assignments/media-types/">
   * http://www.iana.org/assignments/media-types/</a>
   */
  public String getContentType() {
    return this.contentType;
  }

  public void setContentType(String contentType) {
    this.contentType = contentType;
  }

  /** Auto-resolves the media type from the declared type, URL, and raw bytes. */
  private String getContentType(String typeName, String url, byte[] data) {
    return mimeTypes.autoResolveContentType(typeName, url, data);
  }
}
//============================HtmlParser=========================================
package org.apache.nutch.parse.html;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
/**
 * HTML parser implementation: extracts the markup-stripped text, the page
 * title, and outlinks from fetched content, honoring robots meta directives
 * (noindex/nocache).
 * NOTE(review): excerpt — parseTagSoup/parseNeko and the outlink-collection
 * code are not shown in this fragment.
 */
public class HtmlParser implements Parser {
public static final Log LOG = LogFactory.getLog("org.apache.nutch.parse.html");
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
private static final int CHUNK_SIZE = 2000;
// Matches a <meta http-equiv="content-type" ...> tag; group(1) captures its attributes.
private static Pattern metaPattern =
Pattern.compile("<meta\\s+([^>]*http-equiv=\"?content-type\"?[^>]*)>",
Pattern.CASE_INSENSITIVE);
// Extracts the charset name from the matched meta tag's attribute text.
private static Pattern charsetPattern =
Pattern.compile("charset=\\s*([a-z][_\\-0-9a-z]*)",
Pattern.CASE_INSENSITIVE);
private String parserImpl;
/**
 * Scans the first CHUNK_SIZE bytes of the raw content for a
 * content-type meta tag and returns the declared charset name,
 * or null if no such declaration is found.
 */
private static String sniffCharacterEncoding(byte[] content) {
int length = content.length < CHUNK_SIZE ?
content.length : CHUNK_SIZE;
// We don't care about non-ASCII parts so that it's sufficient
// to just inflate each byte to a 16-bit value by padding.
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
// {U+0041, U+0082, U+00B7}.
String str = "";
try {
str = new String(content, 0, length,
Charset.forName("ASCII").toString());
} catch (UnsupportedEncodingException e) {
// code should never come here, but just in case...
return null;
}
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
if (metaMatcher.find()) {
Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
if (charsetMatcher.find())
encoding = new String(charsetMatcher.group(1));
}
return encoding;
}
private String defaultCharEncoding;
private Configuration conf;
private DOMContentUtils utils;
private HtmlParseFilters htmlParseFilters;
private String cachingPolicy;
/**
 * Parses the fetched content into text (markup stripped), the page title,
 * and outlinks, then runs the configured HtmlParseFilter chain over the
 * result.
 */
public ParseResult getParse(Content content) {
// 'content' is a Content object wrapping a byte[] that holds the raw HTML page.
HTMLMetaTags metaTags = new HTMLMetaTags();
URL base;
try {
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
String text = "";
String title = "";
// NOTE(review): outlinks stays empty in this excerpt — presumably populated
// by DOMContentUtils in the full source; verify there.
Outlink[] outlinks = new Outlink[0];
Metadata metadata = new Metadata();
// parse the content
DocumentFragment root;// DOM of the HTML page
try {
byte[] contentInOctets = content.getContent();// raw byte[] of the HTML page held by the Content object
InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
// Guess the character encoding: protocol clues first, then the sniffed meta tag.
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(content, true);
detector.addClue(sniffCharacterEncoding(contentInOctets), "sniffed");
String encoding = detector.guessEncoding(content, defaultCharEncoding);
metadata.set(Metadata.ORIGINAL_CHAR_ENCODING, encoding);
metadata.set(Metadata.CHAR_ENCODING_FOR_CONVERSION, encoding);
input.setEncoding(encoding);
if (LOG.isTraceEnabled()) { LOG.trace("Parsing..."); }
root = parse(input);// parse the HTML, returning a DOM fragment
} catch (IOException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (DOMException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (SAXException e) {
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
} catch (Exception e) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
}
// get meta directives
HTMLMetaProcessor.getMetaTags(metaTags, root, base);
if (LOG.isTraceEnabled()) {
LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
}
// check meta directives
if (!metaTags.getNoIndex()) { // okay to index
StringBuffer sb = new StringBuffer();
if (LOG.isTraceEnabled()) { LOG.trace("Getting text..."); }
utils.getText(sb, root); // page text with markup stripped
text = sb.toString();
sb.setLength(0);
if (LOG.isTraceEnabled()) { LOG.trace("Getting title..."); }
utils.getTitle(sb, root); // page title
title = sb.toString().trim();
}
// NOTE(review): 'status' is not declared anywhere in this excerpt —
// presumably assigned by meta-directive handling omitted from this view.
ParseData parseData = new ParseData(status, title, outlinks,
content.getMetadata(), metadata);
ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
new ParseImpl(text, parseData));
/**
 * run filters on parse:
 * each filter receives the ParseResult produced by the previous one and
 * returns its own (possibly modified) ParseResult.
 */
ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult,
metaTags, root);
if (metaTags.getNoCache()) { // not okay to cache
for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY,
cachingPolicy);
}
return filteredParse;
}
// Returns the parsed DOM document fragment, using the configured parser backend.
private DocumentFragment parse(InputSource input) throws Exception {
if (parserImpl.equalsIgnoreCase("tagsoup"))
return parseTagSoup(input);
else return parseNeko(input);
}
}
//================================HtmlParseFilters=============================================
package org.apache.nutch.parse;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.plugin.*;
import org.w3c.dom.DocumentFragment;
/** Creates and caches {@link HtmlParseFilter} implementing plugins. */
public class HtmlParseFilters {

  public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";

  private HtmlParseFilter[] htmlParseFilters;

  /**
   * Runs every configured filter over the parse result, in order.
   * Each filter receives the result produced by the previous one; if any
   * filter reports failure, the failed entries are removed and the chain
   * stops early.
   *
   * @param content     page content and fetch-time metadata
   * @param parseResult result produced by the previous filter (or the parser)
   * @param metaTags    meta directives extracted from the page
   * @param doc         the page's HTML DOM
   * @return the parse result after all filters have run
   */
  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    for (HtmlParseFilter htmlParseFilter : this.htmlParseFilters) {
      parseResult = htmlParseFilter.filter(content, parseResult, metaTags, doc);
      if (!parseResult.isSuccess()) {
        // TODO: What happens when parseResult.isEmpty() ?
        // Maybe clone parseResult and use parseResult as backup...
        // remove failed parse before return
        parseResult.filter();
        return parseResult;
      }
    }
    return parseResult;
  }
}
类图 (Class diagram):
时序图 (Sequence diagram):