Nutch 2.3.1:执行 update 作业(DbUpdaterJob)时,格式错误的 URL 导致作业崩溃

原因可能是:从有问题的 HTML 中解析出了格式错误的外链 URL。
解决办法:在 DbUpdateMapper.java 的 map 方法中加一个 try/catch,遇到这类 URL 时记录日志并跳过。

 55  @Override
     /*
      * Maps one stored page to its update records.
      *
      * Pages for which Mark.GENERATE_MARK.checkMark(page) returns null are
      * skipped. Otherwise the method writes the page itself under its own
      * key, followed by one ScoreDatum per outlink, each keyed by the
      * reversed outlink URL.
      *
      * key     - the page's row key; it is passed to TableUtil.unreverseUrl
      *           to obtain the page URL
      * page    - the page whose outlinks and markers are read
      * context - the output sink for (urlWithScore, nutchWritable) pairs
      *
      * Throws IOException (which includes MalformedURLException from
      * TableUtil.reverseUrl at line 103) and InterruptedException.
      */
 56  public void map(String key, WebPage page, Context context)
 57      throws IOException, InterruptedException {
     // Nothing to update for a page that carries no generate mark.
 58    if (Mark.GENERATE_MARK.checkMark(page) == null) {
 59      if (LOG.isDebugEnabled()) {
 60        LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
 61            + "; not generated yet");
 62      }
 63      return;
 64    }
 65
 66    String url = TableUtil.unreverseUrl(key);
 67
     // scoreData is a field reused across calls, so it is emptied first.
 68    scoreData.clear();
 69    Map<CharSequence, CharSequence> outlinks = page.getOutlinks();
 70    if (outlinks != null) {
 71      for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
           // depth stays Integer.MAX_VALUE unless the page carries a
           // DbUpdaterJob.DISTANCE marker, which is parsed as an int.
           // NOTE(review): this lookup does not depend on the loop variable
           // and could be hoisted; a non-numeric marker would throw
           // NumberFormatException here.
 72        int depth = Integer.MAX_VALUE;
 73        CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
 74        if (depthUtf8 != null)
 75          depth = Integer.parseInt(depthUtf8.toString());
           // Added fix: drop outlinks whose URL cannot be reversed.
           // reverseUrl is called only to see whether it throws; the result
           // (testUrl) is never read. Without this guard the same call at
           // line 103 would throw MalformedURLException out of map().
 76        try {
 77            String testUrl = TableUtil.reverseUrl(e.getKey().toString());
 78        } catch (MalformedURLException ex) {
 79            LOG.warn("dbupdate,error url:" + e.getKey().toString());
 80            continue;
 81        }
           // Each accepted outlink starts with score 0.0f; the map key is the
           // outlink URL and the map value is passed through as a string
           // (presumably the anchor text - TODO confirm).
 82        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue()
 83            .toString(), depth));
 84      }
 85    }
 86
 87    // TODO: Outlink filtering (i.e. "only keep the first n outlinks")
     // A scoring failure is logged and the page is still written below.
     // NOTE(review): outlinks.size() still counts the outlinks that were
     // skipped above, so it can exceed scoreData.size() - confirm this is
     // what the scoring filters expect.
 88    try {
 89      scoringFilters.distributeScoreToOutlinks(url, page, scoreData,
 90          (outlinks == null ? 0 : outlinks.size()));
 91    } catch (ScoringFilterException e) {
 92      LOG.warn("Distributing score failed for URL: " + key + " exception:"
 93          + StringUtils.stringifyException(e));
 94    }
 95
     // Emit the page itself under its own key with the maximum float score
     // (presumably so it sorts ahead of the score records - TODO confirm).
 96    urlWithScore.setUrl(key);
 97    urlWithScore.setScore(Float.MAX_VALUE);
 98    pageWritable.setWebPage(page);
 99    nutchWritable.set(pageWritable);
100    context.write(urlWithScore, nutchWritable);
101
     // Emit one record per outlink: the output key is the reversed outlink
     // URL, and the datum's own URL is overwritten with the source page URL.
102    for (ScoreDatum scoreDatum : scoreData) {
103      String reversedOut = TableUtil.reverseUrl(scoreDatum.getUrl());
104      scoreDatum.setUrl(url);
105      urlWithScore.setUrl(reversedOut);
106      urlWithScore.setScore(scoreDatum.getScore());
107      nutchWritable.set(scoreDatum);
108      context.write(urlWithScore, nutchWritable);
109    }
110  }
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值