// ParseData类表示网页的数据,包含标题、meta标签等。
package org.apache.nutch.parse;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
/**
 * Data extracted from a page's content: a parse status, the title, the
 * outlinks, and two metadata collections (one from the content, one from the
 * parser).
 *
 * @see Parse#getData()
 */
public final class ParseData extends VersionedWritable {

  public static final String DIR_NAME = "parse_data";

  private static final byte VERSION = 5;

  private String title;
  private Outlink[] outlinks;

  /** Metadata map taken from the original content. */
  private Metadata contentMeta;

  /** Metadata map filled in by the parser. */
  private Metadata parseMeta;

  private ParseStatus status;
  private byte version = VERSION;

  /**
   * Creates an instance without caller-supplied parse metadata. A fresh, empty
   * {@link Metadata} is used for it; it can be replaced afterwards through
   * {@link #setParseMeta(Metadata)}.
   */
  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
      Metadata contentMeta) {
    this(status, title, outlinks, contentMeta, new Metadata());
  }

  /**
   * Creates an instance from all five components. The arguments are stored as
   * passed; no copies are made.
   */
  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
      Metadata contentMeta, Metadata parseMeta) {
    this.status = status;
    this.title = title;
    this.outlinks = outlinks;
    this.contentMeta = contentMeta;
    this.parseMeta = parseMeta;
  }

  /**
   * The original Metadata retrieved from content. The internal instance itself
   * is returned, so callers can set values by calling its put() method.
   */
  public Metadata getContentMeta() {
    return contentMeta;
  }

  /**
   * Other content properties.
   * This is the place to find format-specific properties.
   * Different parser implementations for different content types will populate
   * this differently. The internal instance itself is returned, so callers can
   * set values by calling its put() method.
   */
  public Metadata getParseMeta() {
    return parseMeta;
  }

  /**
   * Replaces the parse metadata. This setter exists because the four-argument
   * constructor does not take parse metadata from the caller.
   */
  public void setParseMeta(Metadata parseMeta) {
    this.parseMeta = parseMeta;
  }

  /**
   * Get a metadata single value.
   * The parse metadata is consulted first; only when it yields {@code null}
   * is the content metadata consulted.
   *
   * @param name the metadata name to look up
   * @return the value from the parse metadata if non-null, otherwise whatever
   *         the content metadata returns for {@code name}
   * @see #getContentMeta()
   * @see #getParseMeta()
   */
  public String getMeta(String name) {
    final String fromParse = parseMeta.get(name);
    return (fromParse != null) ? fromParse : contentMeta.get(name);
  }
}
Parse接口:
package org.apache.nutch.parse;
/**
 * The result of parsing a page's raw content. Exposes the page text, the
 * other extracted data, and a canonical flag.
 *
 * @see Parser#getParse(Content)
 */
public interface Parse {
/**
 * The textual content of the page. This is indexed, searched, and used when
 * generating snippets.
 *
 * @return the page text
 */
String getText();
/**
 * Other data extracted from the page.
 *
 * @return the {@link ParseData} for this parse
 */
ParseData getData();
/**
 * Indicates if the parse is coming from a url or a sub-url.
 *
 * @return the canonical flag. NOTE(review): presumably {@code true} when the
 *         parse belongs to the originally requested url and {@code false} for
 *         a sub-url; this declaration does not establish the mapping, so
 *         confirm against the implementing class.
 */
boolean isCanonical();
}
ParseResult类:
package org.apache.nutch.parse;
/**
 * A collection of {@link Parse} instances, each stored under a {@link Text}
 * key built from a page url.
 */
public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {

  /** Map holding the Parse instances; the key is the page url. */
  private Map<Text, Parse> parseMap;

  /** Url that each stored key is compared against in {@link #put(String, ParseText, ParseData)}. */
  private String originalUrl;

  /**
   * Builds a ParseResult for {@code url} and stores the text and data of
   * {@code parse} in it under that same url.
   *
   * @param url   the url passed to the ParseResult constructor and used as the entry key
   * @param parse source of the text and data to store
   * @return the newly created ParseResult
   */
  public static ParseResult createParseResult(String url, Parse parse) {
    final ParseResult single = new ParseResult(url);
    final Text urlKey = new Text(url);
    final ParseText pageText = new ParseText(parse.getText());
    // Goes through the Text-keyed overload, which forwards to the String-keyed one.
    single.put(urlKey, pageText, parse.getData());
    return single;
  }

  /**
   * Stores an entry under a {@link Text} key by forwarding the key's string
   * form to {@link #put(String, ParseText, ParseData)}.
   */
  public void put(Text key, ParseText text, ParseData data) {
    final String keyString = key.toString();
    put(keyString, text, data);
  }

  /**
   * Stores one Parse instance in the map, keyed by the page url. The new
   * {@code ParseImpl} receives, as its third argument, whether {@code key}
   * equals the original url.
   */
  public void put(String key, ParseText text, ParseData data) {
    final Text mapKey = new Text(key);
    final boolean matchesOriginal = key.equals(originalUrl);
    parseMap.put(mapKey, new ParseImpl(text, data, matchesOriginal));
  }
}
ParseResult 内包含多个 Parse 实例(具体为 ParseImpl 的实例;ParseImpl 实现了 Parse 接口,代表一个网页 parse 后的数据(result of parsing a page's raw content))。
实现了Parse接口的ParseImpl类有public ParseData getData() { return data; }这个方法,此方法返回的是ParseData类的实例