Nutch Source Code Analysis: HtmlParser's getParse and ParseOutputFormat

2021SC@SDUSC

HtmlParser::getParse->getMetaTags

public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
    URL currURL) {
  metaTags.reset();
  getMetaTagsHelper(metaTags, node, currURL);
}

private static final void getMetaTagsHelper(HTMLMetaTags metaTags, Node node,
    URL currURL) {

  if (node.getNodeType() == Node.ELEMENT_NODE) {

    if ("body".equalsIgnoreCase(node.getNodeName())) {
      return;
    }

    if ("meta".equalsIgnoreCase(node.getNodeName())) {
      NamedNodeMap attrs = node.getAttributes();
      Node nameNode = null;
      Node equivNode = null;
      Node contentNode = null;
      for (int i = 0; i < attrs.getLength(); i++) {
        Node attr = attrs.item(i);
        String attrName = attr.getNodeName().toLowerCase();
        if (attrName.equals("name")) {
          nameNode = attr;
        } else if (attrName.equals("http-equiv")) {
          equivNode = attr;
        } else if (attrName.equals("content")) {
          contentNode = attr;
        }
      }

      ...

    } else if ("base".equalsIgnoreCase(node.getNodeName())) {

      ...

    }
  }

  NodeList children = node.getChildNodes();
  if (children != null) {
    int len = children.getLength();
    for (int i = 0; i < len; i++) {
      getMetaTagsHelper(metaTags, children.item(i), currURL);
    }
  }
}

The overall idea of getMetaTagsHelper is to recursively walk the nodes produced by NekoHTML and configure HTMLMetaTags according to what each node contains.
For example, a body node causes an immediate return (meta tags only occur before the body); for a meta node, the function picks out the node's name, http-equiv and content attributes and sets HTMLMetaTags from them. The elided code is what actually writes those attribute values into HTMLMetaTags; since it is tangential to the main flow, it is not examined in detail here.
At the end, getMetaTagsHelper fetches all children of the current node and recursively calls itself on each of them.
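
To see the traversal pattern in isolation, here is a minimal self-contained sketch that uses the JDK's DOM parser instead of NekoHTML and a plain Map standing in for HTMLMetaTags (MetaWalkDemo and collectMeta are invented names for illustration, not Nutch code):

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public class MetaWalkDemo {

  // Collect <meta name="..." content="..."> pairs into a Map while
  // recursing exactly like getMetaTagsHelper does.
  static void collectMeta(Map<String, String> metaTags, Node node) {
    if (node.getNodeType() == Node.ELEMENT_NODE) {
      if ("body".equalsIgnoreCase(node.getNodeName())) {
        return; // meta tags live in <head>, so stop at <body>
      }
      if ("meta".equalsIgnoreCase(node.getNodeName())) {
        NamedNodeMap attrs = node.getAttributes();
        Node name = attrs.getNamedItem("name");
        Node content = attrs.getNamedItem("content");
        if (name != null && content != null) {
          metaTags.put(name.getNodeValue().toLowerCase(), content.getNodeValue());
        }
      }
    }
    NodeList children = node.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      collectMeta(metaTags, children.item(i));
    }
  }

  public static void main(String[] args) throws Exception {
    String html = "<html><head><meta name=\"robots\" content=\"noindex,nofollow\"/>"
        + "<title>t</title></head><body><p>hi</p></body></html>";
    Node root = DocumentBuilderFactory.newInstance().newDocumentBuilder()
        .parse(new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)))
        .getDocumentElement();
    Map<String, String> metaTags = new HashMap<String, String>();
    collectMeta(metaTags, root);
    System.out.println(metaTags); // {robots=noindex,nofollow}
  }
}

Running it prints {robots=noindex,nofollow}: the body subtree is never entered, exactly as in getMetaTagsHelper.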

HtmlParser::getParse->getText

public void getText(StringBuffer sb, Node node) {
  getText(sb, node, false);
}

public boolean getText(StringBuffer sb, Node node,
    boolean abortOnNestedAnchors) {
  if (getTextHelper(sb, node, abortOnNestedAnchors, 0)) {
    return true;
  }
  return false;
}

private boolean getTextHelper(StringBuffer sb, Node node,
    boolean abortOnNestedAnchors, int anchorDepth) {
  boolean abort = false;
  NodeWalker walker = new NodeWalker(node);

  while (walker.hasNext()) {

    Node currentNode = walker.nextNode();
    String nodeName = currentNode.getNodeName();
    short nodeType = currentNode.getNodeType();

    if ("script".equalsIgnoreCase(nodeName)) {
      walker.skipChildren();
    }
    if ("style".equalsIgnoreCase(nodeName)) {
      walker.skipChildren();
    }
    if (nodeType == Node.COMMENT_NODE) {
      walker.skipChildren();
    }
    if (nodeType == Node.TEXT_NODE) {
      String text = currentNode.getNodeValue();
      text = text.replaceAll("\\s+", " ");
      text = text.trim();
      if (text.length() > 0) {
        if (sb.length() > 0)
          sb.append(' ');
        sb.append(text);
      }
    }
  }

  return abort;
}

getText extracts the text content of the HTML document; internally it delegates to getTextHelper. NodeWalker wraps a node and supports depth-first traversal of the node tree. The loop visits every node, skipping script elements, style elements and comment nodes together with all of their children; for every TEXT_NODE it collapses runs of whitespace and appends the resulting text to the caller-supplied StringBuffer sb.
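
Nutch's NodeWalker is essentially a stack-based depth-first iterator whose skipChildren call drops the current node's subtree. The following self-contained sketch reproduces that behavior with an explicit stack (TextExtractDemo and extractText are illustrative names; the real NodeWalker is implemented differently but traverses the same way):

import java.util.ArrayDeque;
import java.util.Deque;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public class TextExtractDemo {

  // Depth-first walk with an explicit stack: pop a node, handle it, and
  // push its children unless the subtree should be skipped.
  static void extractText(StringBuilder sb, Node root) {
    Deque<Node> stack = new ArrayDeque<Node>();
    stack.push(root);
    while (!stack.isEmpty()) {
      Node current = stack.pop();
      String name = current.getNodeName();
      short type = current.getNodeType();

      // Equivalent of walker.skipChildren(): simply don't push the subtree.
      boolean skipChildren = "script".equalsIgnoreCase(name)
          || "style".equalsIgnoreCase(name)
          || type == Node.COMMENT_NODE;

      if (type == Node.TEXT_NODE) {
        String text = current.getNodeValue().replaceAll("\\s+", " ").trim();
        if (text.length() > 0) {
          if (sb.length() > 0) sb.append(' ');
          sb.append(text);
        }
      }

      if (!skipChildren) {
        NodeList children = current.getChildNodes();
        // Push in reverse so children come off the stack in document order.
        for (int i = children.getLength() - 1; i >= 0; i--) {
          stack.push(children.item(i));
        }
      }
    }
  }

  public static void main(String[] args) throws Exception {
    String html = "<html><body><p>Hello <b>world</b></p>"
        + "<script>var x = 1;</script></body></html>";
    Node root = javax.xml.parsers.DocumentBuilderFactory.newInstance()
        .newDocumentBuilder()
        .parse(new java.io.ByteArrayInputStream(html.getBytes("UTF-8")))
        .getDocumentElement();
    StringBuilder sb = new StringBuilder();
    extractText(sb, root);
    System.out.println(sb); // Hello world
  }
}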

HtmlParser::getParse->getTitle

public boolean getTitle(StringBuffer sb, Node node) {

  NodeWalker walker = new NodeWalker(node);

  while (walker.hasNext()) {

    Node currentNode = walker.nextNode();
    String nodeName = currentNode.getNodeName();
    short nodeType = currentNode.getNodeType();

    if ("body".equalsIgnoreCase(nodeName)) {
      return false;
    }

    if (nodeType == Node.ELEMENT_NODE) {
      if ("title".equalsIgnoreCase(nodeName)) {
        getText(sb, currentNode);
        return true;
      }
    }
  }

  return false;
}

As with getText, a NodeWalker traverses all nodes. If the walker reaches the body element before any title has been found, the method returns false immediately (a title can only appear in the head); if it finds a title element, it calls getText to collect all text under that node into sb and returns true.
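
Putting the two methods together, typical usage looks roughly like the following hypothetical sketch (it assumes a DOMContentUtils instance utils and a parsed DOM root root; both names are made up here):

StringBuffer title = new StringBuffer();
StringBuffer text = new StringBuffer();
if (!utils.getTitle(title, root)) {
  // the walker hit <body> before any <title>: the page has no usable title
}
utils.getText(text, root); // whole-page text, whitespace-normalized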

HtmlParser::getParse->getOutlinks

public void getOutlinks(URL base, ArrayList outlinks, Node node) {

  NodeWalker walker = new NodeWalker(node);
  while (walker.hasNext()) {

    Node currentNode = walker.nextNode();
    String nodeName = currentNode.getNodeName();
    short nodeType = currentNode.getNodeType();
    NodeList children = currentNode.getChildNodes();
    int childLen = (children != null) ? children.getLength() : 0;

    if (nodeType == Node.ELEMENT_NODE) {

      nodeName = nodeName.toLowerCase();
      LinkParams params = (LinkParams) linkParams.get(nodeName);
      if (params != null) {
        if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {

          StringBuffer linkText = new StringBuffer();
          getText(linkText, currentNode, true);
          if (linkText.toString().trim().length() == 0) {
            NodeWalker subWalker = new NodeWalker(currentNode);
            while (subWalker.hasNext()) {
              Node subNode = subWalker.nextNode();
              if (subNode.getNodeType() == Node.ELEMENT_NODE) {
                if (subNode.getNodeName().toLowerCase().equals("img")) {
                  NamedNodeMap subAttrs = subNode.getAttributes();
                  Node alt = subAttrs.getNamedItem("alt");
                  if (alt != null) {
                    String altTxt = alt.getTextContent();
                    if (altTxt != null && altTxt.trim().length() > 0) {
                      if (linkText.length() > 0)
                        linkText.append(' ');
                      linkText.append(altTxt);
                    }
                  }
                } else {
                  // other element types contribute no anchor text
                }
              } else if (subNode.getNodeType() == Node.TEXT_NODE) {
                String txt = subNode.getTextContent();
                if (txt != null && txt.length() > 0) {
                  if (linkText.length() > 0)
                    linkText.append(' ');
                  linkText.append(txt);
                }
              }
            }
          }

          NamedNodeMap attrs = currentNode.getAttributes();
          String target = null;
          boolean noFollow = false;
          boolean post = false;
          for (int i = 0; i < attrs.getLength(); i++) {
            Node attr = attrs.item(i);
            String attrName = attr.getNodeName();
            if (params.attrName.equalsIgnoreCase(attrName)) {
              target = attr.getNodeValue();
            } else if ("rel".equalsIgnoreCase(attrName)
                && "nofollow".equalsIgnoreCase(attr.getNodeValue())) {
              noFollow = true;
            } else if ("method".equalsIgnoreCase(attrName)
                && "post".equalsIgnoreCase(attr.getNodeValue())) {
              post = true;
            }
          }
          if (target != null && !noFollow && !post)
            try {
              URL url = URLUtil.resolveURL(base, target);
              outlinks.add(new Outlink(url.toString(), linkText.toString()
                  .trim()));
            } catch (MalformedURLException e) {
              // skip outlinks whose target cannot be resolved to a URL
            }
        }
        if (params.childLen == 0)
          continue;
      }
    }
  }
}

The overall idea of getOutlinks is to walk every node, extract the URL held in the relevant attribute of each link-bearing element (for example the href attribute of an a tag, or the src attribute of an img tag), collect any auxiliary text (an img tag's alt text, text nested inside the element, and so on), and finally wrap the URL and that text into an Outlink appended to the outlinks list.
The linkParams member records which kinds of tags DOMContentUtils handles and which attribute of each tag carries the URL; it is populated in DOMContentUtils' setConf function, covered below.
getText first gathers the element's text content as the link's anchor text linkText; if that turns out empty, the element's descendants are walked and the text is taken from the alt attribute of any img descendant, or from descendants of type TEXT_NODE.
Further down, the URL is read from the configured attribute of the element and added to the outlinks list; however, if the element carries a rel attribute whose value is nofollow, or a method attribute whose value is post (as on a form), the URL is not added.
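
The key step is resolving a possibly relative link target against the page's base URL. Here is a minimal sketch of that resolution using plain java.net.URL; Nutch's URLUtil.resolveURL wraps this same idea and additionally works around some corner cases that java.net.URL handles badly:

import java.net.MalformedURLException;
import java.net.URL;

public class ResolveDemo {
  public static void main(String[] args) throws MalformedURLException {
    URL base = new URL("http://example.com/dir/page.html");

    // Relative paths resolve against the base's directory, absolute paths
    // against its host, and fully qualified URLs win outright.
    System.out.println(new URL(base, "a.html"));              // http://example.com/dir/a.html
    System.out.println(new URL(base, "/root.html"));          // http://example.com/root.html
    System.out.println(new URL(base, "http://other.org/x"));  // http://other.org/x
  }
}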

DOMContentUtils::setConf

public void setConf(Configuration conf) {
  Collection forceTags = new ArrayList(1);

  this.conf = conf;
  linkParams.clear();
  linkParams.put("a", new LinkParams("a", "href", 1));
  linkParams.put("area", new LinkParams("area", "href", 0));
  if (conf.getBoolean("parser.html.form.use_action", true)) {
    linkParams.put("form", new LinkParams("form", "action", 1));
    if (conf.get("parser.html.form.use_action") != null)
      forceTags.add("form");
  }
  linkParams.put("frame", new LinkParams("frame", "src", 0));
  linkParams.put("iframe", new LinkParams("iframe", "src", 0));
  linkParams.put("script", new LinkParams("script", "src", 0));
  linkParams.put("link", new LinkParams("link", "href", 0));
  linkParams.put("img", new LinkParams("img", "src", 0));

  String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
  for (int i = 0; ignoreTags != null && i < ignoreTags.length; i++) {
    if (!forceTags.contains(ignoreTags[i]))
      linkParams.remove(ignoreTags[i]);
  }
}

This function shows that by default DOMContentUtils extracts URLs from the href attribute of a tags, the href of area tags, the action of form tags, the src of frame, iframe and script tags, the href of link tags and the src of img tags. To stop collecting URLs from a particular tag, set the parser.html.outlinks.ignore_tags property.
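
In a deployment this property would normally be set in nutch-site.xml; the sketch below shows the equivalent programmatic configuration through Hadoop's Configuration class (IgnoreTagsDemo is an invented name):

import org.apache.hadoop.conf.Configuration;

public class IgnoreTagsDemo {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Stop collecting outlinks from <img> and <script>. Note that "form"
    // would survive this list whenever parser.html.form.use_action is
    // explicitly set, because setConf adds it to forceTags.
    conf.setStrings("parser.html.outlinks.ignore_tags", "img", "script");
    conf.setBoolean("parser.html.form.use_action", true);
    // A DOMContentUtils configured with this conf drops the img and script
    // entries from linkParams in setConf.
  }
}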

With the map function covered, we now turn to the getRecordWriter function of ParseOutputFormat, which creates a RecordWriter to handle the job's output.

ParseOutputFormat::getRecordWriter

public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job,
    String name, Progressable progress) throws IOException {

  ...

  Path out = FileOutputFormat.getOutputPath(job);

  Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
  Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
  Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

  ...

  final MapFile.Writer textOut = new MapFile.Writer(job, text,
      tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);

  ...

  final MapFile.Writer dataOut = new MapFile.Writer(job, data,
      dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);

  final SequenceFile.Writer crawlOut = SequenceFile.createWriter(job,
      SequenceFile.Writer.file(crawl),
      SequenceFile.Writer.keyClass(Text.class),
      SequenceFile.Writer.valueClass(CrawlDatum.class),
      SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size", 4096)),
      SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)),
      SequenceFile.Writer.blockSize(1073741824),
      SequenceFile.Writer.compression(compType, new DefaultCodec()),
      SequenceFile.Writer.progressable(progress),
      SequenceFile.Writer.metadata(new Metadata()));

  return new RecordWriter<Text, Parse>() {
    ...
  };
}

getRecordWriter creates the parse_text, parse_data and crawl_parse directories under crawl/segments/2*, and then creates the corresponding output writers textOut, dataOut and crawlOut. When data arrives, the returned RecordWriter's write function is called to write it out.
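
As a rough picture of what ends up on disk: parse_text and parse_data are Hadoop MapFiles keyed by URL, so an individual page can be looked up directly. A hypothetical read-back sketch, assuming a Hadoop 2.x-style MapFile.Reader and Nutch's ParseText class (the paths are placeholders):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.ParseText;

public class ReadParseTextDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // e.g. args[0] = crawl/segments/2021.../parse_text/part-00000, args[1] = a URL
    MapFile.Reader reader = new MapFile.Reader(new Path(args[0]), conf);
    ParseText value = new ParseText();
    // MapFile keys are sorted, so get() can seek straight to one URL.
    if (reader.get(new Text(args[1]), value) != null) {
      System.out.println(value.getText());
    }
    reader.close();
  }
}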

RecordWriter::write

public void write(Text key, Parse parse) throws IOException {

  String fromUrl = key.toString();
  String origin = null;
  textOut.append(key, new ParseText(parse.getText()));

  ParseData parseData = parse.getData();
  String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
  if (sig != null) {
    byte[] signature = StringUtil.fromHexString(sig);
    if (signature != null) {
      CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
      d.setSignature(signature);
      crawlOut.append(key, d);
    }
  }

  ...

  Outlink[] links = parseData.getOutlinks();
  int outlinksToStore = Math.min(maxOutlinks, links.length);

  int validCount = 0;
  CrawlDatum adjust = null;
  List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(
      outlinksToStore);
  List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
  for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
    String toUrl = links[i].getToUrl();

    if (!isParsing) {
      toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
          ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode,
          filters, exemptionFilters, normalizers);
      if (toUrl == null) {
        continue;
      }
    }

    CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
    Text targetUrl = new Text(toUrl);
    MapWritable outlinkMD = links[i].getMetadata();
    if (outlinkMD != null) {
      target.getMetaData().putAll(outlinkMD);
    }

    scfilters.initialScore(targetUrl, target);
    targets.add(new SimpleEntry(targetUrl, target));
    links[i].setUrl(toUrl);
    outlinkList.add(links[i]);
    validCount++;
  }

  adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets,
      null, links.length);
  for (Entry<Text, CrawlDatum> target : targets) {
    crawlOut.append(target.getKey(), target.getValue());
  }
  if (adjust != null)
    crawlOut.append(key, adjust);

  Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList
      .size()]);
  parseData = new ParseData(parseData.getStatus(), parseData.getTitle(),
      filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
  dataOut.append(key, parseData);

  ...

}

The write function appends the page text to parse_text, writes a signature record to crawl_parse, and also writes the CrawlDb-bound records to crawl_parse.
Further down, it iterates over the list of extracted URLs, calling ParseOutputFormat.filterNormalize to drop useless URLs (for example .css and image URLs). It then wraps targetUrl and target in a SimpleEntry, where targetUrl is the link's URL and target is a CrawlDatum that captures the link's bookkeeping information, such as when it gets fetched, its fetch status and its initial score.
Next it computes the score adjustment adjust, appends adjust and each link's target CrawlDatum to crawl_parse, then rebuilds ParseData so that its link information is replaced by the filtered links produced above, and finally appends the new ParseData to parse_data.
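
For the signature step, StringUtil.fromHexString turns the hex digest stored in the content metadata back into raw bytes before it is written to crawl_parse. A standalone equivalent, written here purely for illustration (the real helper's edge-case handling may differ):

public class HexSignatureDemo {

  // Decode a hex string such as an MD5 digest into raw bytes; return null
  // on malformed input, which makes the caller skip the signature record.
  static byte[] fromHex(String hex) {
    if (hex == null || hex.length() % 2 != 0) {
      return null;
    }
    byte[] out = new byte[hex.length() / 2];
    for (int i = 0; i < out.length; i++) {
      int hi = Character.digit(hex.charAt(2 * i), 16);
      int lo = Character.digit(hex.charAt(2 * i + 1), 16);
      if (hi < 0 || lo < 0) {
        return null;
      }
      out[i] = (byte) ((hi << 4) | lo);
    }
    return out;
  }

  public static void main(String[] args) {
    byte[] sig = fromHex("d41d8cd98f00b204e9800998ecf8427e"); // MD5 of ""
    System.out.println(sig.length); // 16
  }
}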
