parse的主要工作是对抓取的网页进行解析,解析出其中的text,outlinks和metadata等信息。
首先:
package org.apache.nutch.parse;
Job
同样先从它的job开始看起。
// Build the "parse" MapReduce job. All real work happens in ParserMapper;
// the reduce phase is disabled below, so mapper output goes straight to storage.
currentJob = new NutchJob(getConf(), "parse");
// WebPage fields this job needs loaded from the Gora datastore.
Collection<WebPage.Field> fields = getFields(currentJob);
// Filter restricting the scan to rows belonging to the requested batch.
MapFieldValueFilter<String, WebPage> batchIdFilter = getBatchIdFilter(batchId);
StorageUtils.initMapperJob(currentJob, fields, String.class, WebPage.class,
ParserMapper.class, batchIdFilter);
// IdentityPageReducer is wired in for form's sake only: the next line sets
// the number of reduce tasks to 0, so the reducer never actually runs.
StorageUtils.initReducerJob(currentJob, IdentityPageReducer.class);
currentJob.setNumReduceTasks(0);
currentJob.waitForCompletion(true);
这个好玩,一般的job的Mapper和Reducer比如generate都是GeneratorMapper
和GeneratorReducer
。可是这个呢,一个是ParserMapper
,另一个是IdentityPageReducer
。
那么我们就分别来看看他们俩吧。其实细心的你已经注意到这句话了currentJob.setNumReduceTasks(0)
,也就是说,那个IdentityPageReducer
其实并没有什么用。
ParserMapper
/** Mapper of the parse job: parses one fetched page and re-emits it under the same key. */
public static class ParserMapper extends
GoraMapper<String, WebPage, String, WebPage> {
private ParseUtil parseUtil; /** helper that drives the actual parse plugins */
private boolean shouldResume; /** resume a previously interrupted parse batch? */
private boolean force; /** force re-parsing of pages already marked as parsed */
private Utf8 batchId; /** batch to process (REPARSE means re-parse everything) */
private boolean skipTruncated; /** skip pages whose content was truncated during fetch? */
@Override
public void map(String key, WebPage page, Context context)
throws IOException, InterruptedException {
// Row keys are stored host-reversed; recover the plain URL for logging/checks.
String unreverseKey = TableUtil.unreverseUrl(key);
if (batchId.equals(REPARSE)) {
LOG.debug("Reparsing " + unreverseKey);
} else {
if (Mark.FETCH_MARK.checkMark(page) == null) { /** page has not been fetched yet */
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
+ "; not fetched yet");
}
return;
}
if (shouldResume && Mark.PARSE_MARK.checkMark(page) != null) { /** resuming, and this page already carries a parse mark */
if (force) {
LOG.info("Forced parsing " + unreverseKey + "; already parsed");
} else {
LOG.info("Skipping " + unreverseKey + "; already parsed");
return;
}
} else {
LOG.info("Parsing " + unreverseKey);
}
}
if (skipTruncated && isTruncated(unreverseKey, page)) { /** optionally drop truncated pages */
return;
}
// Core work: run parse plugins and write text/title/outlinks into the page.
parseUtil.process(key, page);
ParseStatus pstatus = page.getParseStatus();
if (pstatus != null) {
context.getCounter("ParserStatus",
ParseStatusCodes.majorCodes[pstatus.getMajorCode()]).increment(1);
}
context.write(key, page); /** map output: &lt;url key, parsed WebPage&gt; */
}
}
其实大家可以看看LOG中的info,然后这段代码是干嘛用的就一目了然。可以看出来map的输出为<String(key), WebPage(page)>
。然后最核心的处理过程被这个方法parseUtil.process(key, page)
一笔带过了,然后我们来看看这个方法到底是个什么鬼。然后map中还有一个判断网页内容是否被截断的一个方法,我们待会再看。
parseUtil.process(key, page)
/**
 * Parses one fetched page: runs the configured parser, stores the parse
 * status, text, title, signature and filtered outlinks back into the page,
 * and finally copies the fetch mark into a parse mark.
 */
public void process(String key, WebPage page) {
String url = TableUtil.unreverseUrl(key); /** un-reverse the row key to recover the plain URL */
byte status = page.getStatus().byteValue();
if (status != CrawlStatus.STATUS_FETCHED) { /** only successfully fetched pages are parsed */
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + url + " as status is: "
+ CrawlStatus.getName(status));
}
return;
}
Parse parse;
try {
parse = parse(url, page);
} catch (ParserNotFound e) {
// No parser registered for this content type; skip silently.
return;
} catch (final Exception e) {
// NOTE(review): failure is swallowed without any logging — consider LOG.warn here.
return;
}
if (parse == null) {
return;
}
org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
page.setParseStatus(pstatus);
if (ParseStatusUtils.isSuccess(pstatus)) {
if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) { /** page redirected elsewhere */
String newUrl = ParseStatusUtils.getMessage(pstatus); /** redirect target = first arg of pstatus */
int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
try {
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
if (newUrl == null) {
LOG.warn("redirect normalized to null " + url);
return;
}
try {
newUrl = filters.filter(newUrl);
} catch (URLFilterException e) {
return;
}
if (newUrl == null) {
LOG.warn("redirect filtered to null " + url);
return;
}
} catch (MalformedURLException e) {
return;
}
// Record the redirect target as an outlink and flag it for the fetcher.
page.getOutlinks().put(new Utf8(newUrl), new Utf8());
page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
TableUtil.YES_VAL);
// NOTE(review): newUrl cannot be null here (null-checked above), so this guard
// reduces to newUrl.equals(url) — verify against upstream Nutch whether the
// intended condition is the negation (!newUrl.equals(url)).
if (newUrl == null || newUrl.equals(url)) {
String reprUrl = URLUtil.chooseRepr(url, newUrl,
refreshTime < FetcherJob.PERM_REFRESH_TIME);
if (reprUrl == null) {
LOG.warn("reprUrl==null for " + url);
return;
} else {
page.setReprUrl(new Utf8(reprUrl));
}
}
} else { /** ParseStatusCodes.SUCCESS_OK */
/**
public class Parse {
private String text;
private String title;
private Outlink[] outlinks;
private org.apache.nutch.storage.ParseStatus parseStatus;
}
*/
page.setText(new Utf8(parse.getText())); /** store extracted page text */
page.setTitle(new Utf8(parse.getTitle())); /** store page title */
ByteBuffer prevSig = page.getSignature(); /** previous signature, preserved for dedup comparison */
if (prevSig != null) {
page.setPrevSignature(prevSig);
}
final byte[] signature = sig.calculate(page); /** default impl: org.apache.nutch.crawl.MD5Signature */
page.setSignature(ByteBuffer.wrap(signature));
if (page.getOutlinks() != null) {
page.getOutlinks().clear();
}
final Outlink[] outlinks = parse.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, outlinks.length); /** cap on stored outlinks */
String fromHost;
if (ignoreExternalLinks) {
try {
fromHost = new URL(url).getHost().toLowerCase();
} catch (final MalformedURLException e) {
fromHost = null;
}
} else {
fromHost = null;
}
int validCount = 0;
for (int i = 0; validCount < outlinksToStore && i < outlinks.length; i++) {
String toUrl = outlinks[i].getToUrl();
try {
toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); /** normalize the link */
toUrl = filters.filter(toUrl); /** then filter it */
} catch (MalformedURLException e2) {
continue;
} catch (URLFilterException e) {
continue;
}
if (toUrl == null) {
continue;
}
Utf8 utf8ToUrl = new Utf8(toUrl);
if (page.getOutlinks().get(utf8ToUrl) != null) { /** already recorded */
// skip duplicate outlinks
continue;
}
String toHost;
if (ignoreExternalLinks) { /** drop links leaving the source host */
try {
toHost = new URL(toUrl).getHost().toLowerCase();
} catch (final MalformedURLException e) {
toHost = null;
}
if (toHost == null || !toHost.equals(fromHost)) { // external links
continue; // skip it
}
}
validCount++;
page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor())); /** keep outlink with its anchor text */
}
Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page); /** read the fetch mark (batch id) */
if (fetchMark != null) {
Mark.PARSE_MARK.putMark(page, fetchMark); /** mark this page as parsed in the same batch */
}
}
}
}
值得一提的是webpage的signature是用来进行重复的检测和去除的。具体网页是怎么来完成解析的呢?还是得靠插件。插件的详细情况得看$NUTCH_HOME/conf/parse-plugins.xml
。
boolean isTruncated(String, WebPage)
/**
 * Reports whether a page's stored content is shorter than the size announced
 * in its Content-Length header, i.e. whether the fetch was truncated.
 *
 * @param url  plain (un-reversed) URL, used only in log messages
 * @param page fetched page whose content and headers are inspected
 * @return true when the declared length exceeds the stored content size
 */
public static boolean isTruncated(String url, WebPage page) {
  ByteBuffer content = page.getContent();
  if (content == null) {
    // Nothing stored, nothing to compare against.
    return false;
  }
  // Declared size as recorded in the protocol headers; may be missing or blank.
  CharSequence declared = page.getHeaders().get(
      new Utf8(HttpHeaders.CONTENT_LENGTH));
  if (declared == null) {
    return false;
  }
  String declaredStr = declared.toString().trim();
  if (StringUtil.isEmpty(declaredStr)) {
    return false;
  }
  int declaredSize;
  try {
    declaredSize = Integer.parseInt(declaredStr);
  } catch (NumberFormatException e) {
    LOG.warn("Wrong contentlength format for " + url, e);
    return false;
  }
  // Actual number of bytes stored for the page.
  int storedSize = content.limit();
  if (declaredSize > storedSize) {
    LOG.warn(url + " skipped. Content of size " + declaredSize
        + " was truncated to " + storedSize);
    return true;
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug(url + " actualSize=" + storedSize + " inHeaderSize="
        + declaredSize);
  }
  return false;
}
可以看出来这个方法主要就是将网页内容的实际长度和官方长度做个比较来判断是否被截断了。
现在map已经看完了,接下来就看看reduce。接上面的话茬,虽然IdentityPageReducer
可能并没有什么用,但是形式还是要走的。
IdentityPageReducer
/**
 * Pass-through reducer: emits every incoming WebPage unchanged under its key.
 * The parse job sets the number of reduce tasks to 0, so in practice this
 * reducer never runs and exists only to satisfy the job wiring.
 */
public class IdentityPageReducer extends
    GoraReducer<String, WebPage, String, WebPage> {

  @Override
  protected void reduce(String key, Iterable<WebPage> values, Context context)
      throws IOException, InterruptedException {
    // Forward each value untouched.
    for (final WebPage value : values) {
      context.write(key, value);
    }
  }
}
这么看来,果然没有什么用。
References
……