(五)元数据的管理与开发

提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档


前言

void logEdit(final FSEditLogOp op) {

    synchronized (this) {
      assert isOpenForWrite() :
        "bad state: " + state;
      
      // wait if an automatic sync is scheduled
      //一开始不需要等待
      waitIfAutoSyncScheduled();
      //最重要的就是生成了全局唯一的事务ID(日志)
      
      // name    editlog_0000000000_0000000012.log
      //         editlog_0000000013_0000000022.log
      //TODO 步骤一:获取当前的独一无二的事务ID
      long start = beginTransaction();
      op.setTransactionId(txid);

      try {
    	  /**
    	   * 1) namenode editlog 文件缓冲里面
    	   * 2) journalnode的内存缓冲
    	   * 看到的只有一个editLogStream,其实背后是由多个流的
    	   *a)写namenode
    	   *b)写journalnode
    	   /
    	  //TODO 步骤二:把元数据写入到内存缓冲
        editLogStream.write(op);
      } catch (IOException ex) {
        // All journals failed, it is handled in logSync.
      } finally {
        op.reset();
      }

      endTransaction(start);
      
      // check if it is time to schedule an automatic sync
      // > 512 true  !true=false
      if (!shouldForceSync()) {
        return;
      }
     //TODO 如果到这儿就说明 缓冲区存满了
      isAutoSyncScheduled = true;
    } 
    //TODO 把数据持久化到磁盘
    logSync();
  }

editLogStream 初始化

    try {
      //TODO Namenode初始化的时候调用这段代码
      editLogStream = journalSet.startLogSegment(segmentTxId,
          NameNodeLayoutVersion.CURRENT_LAYOUT_VERSION);
    } catch (IOException ex) {
      throw new IOException("Unable to start log segment " +
          segmentTxId + ": too few journals successfully started.", ex);
    }
  @Override
  public EditLogOutputStream startLogSegment(final long txId,
      final int layoutVersion) throws IOException {
	  //闭包
    mapJournalsAndReportErrors(new JournalClosure() {
      @Override
      public void apply(JournalAndStream jas) throws IOException {
        jas.startLogSegment(txId, layoutVersion);
      }
    }, "starting log segment " + txId);
    //返回来的是JournalSetOutputStream
    //这个流里面肯定需要封装多个流
    //是因为刚刚大家都是看到  editLogStream.write(日志对象)
    //namenode
    //journalnode
    //Set集合
    return new JournalSetOutputStream();
  }
  
private void mapJournalsAndReportErrors(
      JournalClosure closure, String status) throws IOException{

    List<JournalAndStream> badJAS = Lists.newLinkedList();
    //journals里面我们知道有两个对象:
    //FileJouanlManager:针对是把数据写到namenode磁盘上面
    //QuorumJouanlManager:把数据写到journalnode上面
    // 数据结构 List<JournalAndStream> journals =new CopyOnWriteArrayList<JournalSet.JournalAndStream>();
    //读写分离的数据结构,读比较多,写比较少的数据结构
    for (JournalAndStream jas : journals) {
      try {
    	  //FileJouanlManager
        closure.apply(jas);
      } catch (Throwable t) {
        if (jas.isRequired()) {
          final String msg = "Error: " + status + " failed for required journal ("
            + jas + ")";
          LOG.fatal(msg, t);
          // If we fail on *any* of the required journals, then we must not
          // continue on any of the other journals. Abort them to ensure that
          // retry behavior doesn't allow them to keep going in any way.
          abortAllJournals();
          // the current policy is to shutdown the NN on errors to shared edits
          // dir. There are many code paths to shared edits failures - syncs,
          // roll of edits etc. All of them go through this common function 
          // where the isRequired() check is made. Applying exit policy here 
          // to catch all code paths.
          terminate(1, msg);
        } else {
          LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
          badJAS.add(jas);          
        }
      }
    }
    disableAndReportErrorOnJournals(badJAS);
    if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
        minimumRedundantJournals)) {
      String message = status + " failed for too many journals";
      LOG.error("Error: " + message);
      throw new IOException(message);
    }
  }
  
上面的journals是一个读写分离的数据结构,它的add方法有两个方法进行调用;
  void add(JournalManager j, boolean required, boolean shared) {
    JournalAndStream jas = new JournalAndStream(j, required, shared);
    //所以这个journals里面现在是有两个对象:
    //FileJournalManager
    //quorumJounalManager
    journals.add(jas);
  }
private synchronized void initJournals(List<URI> dirs) {
    int minimumRedundantJournals = conf.getInt(
        DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_KEY,
        DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_MINIMUM_DEFAULT);

    synchronized(journalSetLock) {
      journalSet = new JournalSet(minimumRedundantJournals);
      //重要的代码
      //dirs这个目录时候从HDFS的配置文件里面解析出来的。
      //
      for (URI u : dirs) {//配置文件
        boolean required = FSNamesystem.getRequiredNamespaceEditsDirs(conf)
            .contains(u);
        //如果发下你是本地文件系统
        if (u.getScheme().equals(NNStorage.LOCAL_URI_SCHEME)) {
          StorageDirectory sd = storage.getStorageDirectory(u);
          if (sd != null) {
        	  //如果是本地文件系统,就会创建一个FileJournalnodeManager对象
        	  //这个对象就是用来管理把元数据写入到namenode的
            journalSet.add(new FileJournalManager(conf, sd, storage),
                required, sharedEditsDirs.contains(u));
          }
        } else {
        	//如果不是本地文件系统会针对journalnode创建一个对象。
        	//这个对象就是用来管理把元数据写入到journlanode.
          journalSet.add(createJournal(u), required,
              sharedEditsDirs.contains(u));
        }
      }
    }

每个流都要调用apply方法,其实就是调用上一个函数的startLogSegment
有两个类实现了这个接口

  synchronized public EditLogOutputStream startLogSegment(long txid,
      int layoutVersion) throws IOException {
    try {
      currentInProgress = NNStorage.getInProgressEditsFile(sd, txid);
      //获取当前流
      //如果是FileJournalManager这个对象getCurrentStream 获取当前流
      //EditLogFileOutputStream  write
      EditLogOutputStream stm = new EditLogFileOutputStream(conf,
          currentInProgress, outputBufferCapacity);
      stm.create(layoutVersion);
      return stm;
    } catch (IOException e) {
      LOG.warn("Unable to start log segment " + txid +
          " at " + currentInProgress + ": " +
          e.getLocalizedMessage());
      errorReporter.reportErrorOnFile(currentInProgress);
      throw e;
    }
  }
  @Override
  public EditLogOutputStream startLogSegment(long txId, int layoutVersion)
      throws IOException {
    Preconditions.checkState(isActiveWriter,
        "must recover segments before starting a new one");
    QuorumCall<AsyncLogger, Void> q = loggers.startLogSegment(txId,
        layoutVersion);
    loggers.waitForWriteQuorum(q, startSegmentTimeoutMs,
        "startLogSegment(" + txId + ")");
    return new QuorumOutputStream(loggers, txId,
        outputBufferCapacity, writeTxnsTimeoutMs);
  }

所以logEdit的 步骤二:把元数据写入到内存缓冲;
上面两个类也同时实现了write的接口:

  @Override
  public void write(FSEditLogOp op) throws IOException {
	  //双缓冲
    doubleBuf.writeOp(op);
  }
  public void writeOp(FSEditLogOp op) throws IOException {
	  //把数据写入bufCurrent内存里面。
    bufCurrent.writeOp(op);
  }

同理 logEdit的最后步骤 //TODO 把数据持久化到磁盘logSync();在刷写磁盘的时候会调用到flush

  public void flush(boolean durable) throws IOException {
    numSync++;
    long start = monotonicNow();
    flushAndSync(durable);
    long end = monotonicNow();
    totalTimeSync += (end - start);
  }

其中 flushAndSync(durable);是一个接口由两个类实现该接口

  public void flushAndSync(boolean durable) throws IOException {
    if (fp == null) {
      throw new IOException("Trying to use aborted output stream");
    }
    if (doubleBuf.isFlushed()) {
      LOG.info("Nothing to flush");
      return;
    }
    preallocate(); // preallocate file if necessary
    doubleBuf.flushTo(fp);
    if (durable && !shouldSkipFsyncForTests && !shouldSyncWritesAndSkipFsync) {
      fc.force(false); // metadata updates not needed
    }
  }
  protected void flushAndSync(boolean durable) throws IOException {
    int numReadyBytes = buf.countReadyBytes();
    if (numReadyBytes > 0) {
      int numReadyTxns = buf.countReadyTxns();
      long firstTxToFlush = buf.getFirstReadyTxId();

      assert numReadyTxns > 0;

      // Copy from our double-buffer into a new byte array. This is for
      // two reasons:
      // 1) The IPC code has no way of specifying to send only a slice of
      //    a larger array.
      // 2) because the calls to the underlying nodes are asynchronous, we
      //    need a defensive copy to avoid accidentally mutating the buffer
      //    before it is sent.
      DataOutputBuffer bufToSend = new DataOutputBuffer(numReadyBytes);
      buf.flushTo(bufToSend);
      assert bufToSend.getLength() == numReadyBytes;
      byte[] data = bufToSend.getData();
      assert data.length == bufToSend.getLength();
      //把数据写入到journlanode
      QuorumCall<AsyncLogger, Void> qcall = loggers.sendEdits(
          segmentTxId, firstTxToFlush,
          numReadyTxns, data);
      //等待写入到journalnode集群处理结果
      loggers.waitForWriteQuorum(qcall, writeTimeoutMs, "sendEdits");
      
      // Since we successfully wrote this batch, let the loggers know. Any future
      // RPCs will thus let the loggers know of the most recent transaction, even
      // if a logger has fallen behind.
      loggers.setCommittedTxId(firstTxToFlush + numReadyTxns - 1);
    }
  }

loggers在之前的方法已经进行了初始化

  public QuorumCall<AsyncLogger, Void> sendEdits(
      long segmentTxId, long firstTxnId, int numTxns, byte[] data) {
    Map<AsyncLogger, ListenableFuture<Void>> calls = Maps.newHashMap();
    //遍历所有loggers
    //每个AsyncLogger对象代表就是一个journalnode
    
    for (AsyncLogger logger : loggers) {
      ListenableFuture<Void> future = 
    		  //往journalnode去发送日志。
        logger.sendEdits(segmentTxId, firstTxnId, numTxns, data);
      calls.put(logger, future);
    }
    return QuorumCall.create(calls);
  }

sendedit()的接口被以下实现:

public ListenableFuture<Void> sendEdits(
      final long segmentTxId, final long firstTxnId,
      final int numTxns, final byte[] data) {
    try {
      reserveQueueSpace(data.length);
    } catch (LoggerTooFarBehindException e) {
      return Futures.immediateFailedFuture(e);
    }
    
    // When this batch is acked, we use its submission time in order
    // to calculate how far we are lagging.
    final long submitNanos = System.nanoTime();
    
    ListenableFuture<Void> ret = null;
    try {
    	//异步发送
      ret = singleThreadExecutor.submit(new Callable<Void>() {
        @Override
        public Void call() throws IOException {
          throwIfOutOfSync();

          long rpcSendTimeNanos = System.nanoTime();
          try {
        	  //获取一个代理
        	  //这个是RPC的调用
        	  
        	  //journalnode 一定起服务什么?
        	  //             JournalRpcServer(RPC的客户端,获取服务端的代理)
            getProxy().journal(createReqInfo(),
                segmentTxId, firstTxnId, numTxns, data);
          } catch (IOException e) {
            QuorumJournalManager.LOG.warn(
                "Remote journal " + IPCLoggerChannel.this + " failed to " +
                "write txns " + firstTxnId + "-" + (firstTxnId + numTxns - 1) +
                ". Will try to write to this JN again after the next " +
                "log roll.", e); 
            synchronized (IPCLoggerChannel.this) {
              outOfSync = true;
            }
            throw e;
          } finally {
            long now = System.nanoTime();
            long rpcTime = TimeUnit.MICROSECONDS.convert(
                now - rpcSendTimeNanos, TimeUnit.NANOSECONDS);
            long endToEndTime = TimeUnit.MICROSECONDS.convert(
                now - submitNanos, TimeUnit.NANOSECONDS);
            metrics.addWriteEndToEndLatency(endToEndTime);
            metrics.addWriteRpcLatency(rpcTime);
            if (rpcTime / 1000 > WARN_JOURNAL_MILLIS_THRESHOLD) {
              QuorumJournalManager.LOG.warn(
                  "Took " + (rpcTime / 1000) + "ms to send a batch of " +
                  numTxns + " edits (" + data.length + " bytes) to " +
                  "remote journal " + IPCLoggerChannel.this);
            }
          }
          synchronized (IPCLoggerChannel.this) {
            highestAckedTxId = firstTxnId + numTxns - 1;
            lastAckNanos = submitNanos;
          }
          return null;
        }
      });
    } finally {
      if (ret == null) {
        // it didn't successfully get submitted,
        // so adjust the queue size back down.
        unreserveQueueSpace(data.length);
      } else {
        // It was submitted to the queue, so adjust the length
        // once the call completes, regardless of whether it
        // succeeds or fails.
        Futures.addCallback(ret, new FutureCallback<Void>() {
          @Override
          public void onFailure(Throwable t) {
            unreserveQueueSpace(data.length);
          }

          @Override
          public void onSuccess(Void t) {
            unreserveQueueSpace(data.length);
          }
        });
      }
    }
    return ret;
  }

通过远程调用journal方法

  public void journal(RequestInfo reqInfo,
      long segmentTxId, long firstTxnId,
      int numTxns, byte[] records) throws IOException {
    jn.getOrCreateJournal(reqInfo.getJournalId())
       .journal(reqInfo, segmentTxId, firstTxnId, numTxns, records);
  }

journaljournal(reqInfo, segmentTxId, firstTxnId, numTxns, records)方法里面存在讲消息刷写到磁盘

    curSegment.writeRaw(records, 0, records.length);
    curSegment.setReadyToFlush();
    StopWatch sw = new StopWatch();
    sw.start();
    curSegment.flush(shouldFsync);
    sw.stop();

StandbyCheckpointer

 * TODO StandbyCheckpointer 是一个运行在standBynamenode上的一个线程。
 * 他会周期性的对命名空间做checkpoint的操作(说白了就是把 内存里面目录树的信息持久化到磁盘上面)
 * 并且会把这个份数据上传到active namenode(用来替换 active namednoe上面的fsimage)
 * 
 * hadoop1:
 *   secondaryNamenode:
 *       如果要合并元数据,它是需要去namenode上面去读取数据的。然后
 *       在自己的合并里面合并,合并完了以后再去替换 namenode里面的磁盘上面fsimage
 *   standByNamenode:
 *       他要做chekpoint的时候是不需要去读取active namenode里面的元数据。
 *       元数据在他自己的内存里面,他本来就有。它要是想做checkpoint它只需要把
 *       自己内存里面的元数据写到磁盘上面就可以,然后把磁盘上面的fsimage
 *       上传到active namenode里面去替换 avtive namenode的fsimage文件就可以。
private void doWork() {
      final long checkPeriod = 1000 * checkpointConf.getCheckPeriod();
      // Reset checkpoint time so that we don't always checkpoint
      // on startup.
      lastCheckpointTime = monotonicNow();
      while (shouldRun) {
        boolean needRollbackCheckpoint = namesystem.isNeedRollbackFsImage();
        if (!needRollbackCheckpoint) {
          try {
        	  //TODO 每隔60检查以下是否需要做checkpoint
            Thread.sleep(checkPeriod);
          } catch (InterruptedException ie) {
          }
          if (!shouldRun) {
            break;
          }
        }
        try {
          // We may have lost our ticket since last checkpoint, log in again, just in case
          if (UserGroupInformation.isSecurityEnabled()) {
            UserGroupInformation.getCurrentUser().checkTGTAndReloginFromKeytab();
          }
          
          final long now = monotonicNow();
          //TODO checkpoint条件一
          //这儿是计算以下,我们上一次checkpoint 现在最新的数据差了多少数据?
          //或者说大概的意思就是说我们现在有多少条日志没有checkpoint了。
          final long uncheckpointed = countUncheckpointedTxns();
          //TODO checkpoint条件二
          //当前时间 - 上一次checkpoint的时间。
          //说白了这个变量代表的意思就是 已经有多久没有做checkpoint了。
          final long secsSinceLast = (now - lastCheckpointTime) / 1000;
          
          boolean needCheckpoint = needRollbackCheckpoint;
          if (needCheckpoint) {
            LOG.info("Triggering a rollback fsimage for rolling upgrade.");
            //TODO 条件一:如果超过100万条日志没有做checkpoint,那么就需要做一次。
          } else if (uncheckpointed >= checkpointConf.getTxnCount()) {
            LOG.info("Triggering checkpoint because there have been " + 
                uncheckpointed + " txns since the last checkpoint, which " +
                "exceeds the configured threshold " +
                checkpointConf.getTxnCount());
            needCheckpoint = true;
            //TODO 条件二: 如果超过一个小时没有做checkpoint了,那么需要做一次。
          } else if (secsSinceLast >= checkpointConf.getPeriod()) {
            LOG.info("Triggering checkpoint because it has been " +
                secsSinceLast + " seconds since the last checkpoint, which " +
                "exceeds the configured interval " + checkpointConf.getPeriod());
            needCheckpoint = true;
          }
          
          synchronized (cancelLock) {
            if (now < preventCheckpointsUntil) {
              LOG.info("But skipping this checkpoint since we are about to failover!");
              canceledCount++;
              continue;
            }
            assert canceler == null;
            canceler = new Canceler();
          }
          //TODO 满足条件
          if (needCheckpoint) {
        	  //TODO 执行checkpoint
            doCheckpoint();
            // reset needRollbackCheckpoint to false only when we finish a ckpt
            // for rollback image
            if (needRollbackCheckpoint
                && namesystem.getFSImage().hasRollbackFSImage()) {
              namesystem.setCreatedRollbackImages(true);
              namesystem.setNeedRollbackFsImage(false);
            }
            lastCheckpointTime = now;
          }
        } catch (SaveNamespaceCancelledException ce) {
          LOG.info("Checkpoint was cancelled: " + ce.getMessage());
          canceledCount++;
        } catch (InterruptedException ie) {
          LOG.info("Interrupted during checkpointing", ie);
          // Probably requested shutdown.
          continue;
        } catch (Throwable t) {
          LOG.error("Exception in doCheckpoint", t);
        } finally {
          synchronized (cancelLock) {
            canceler = null;
          }
        }
      }
    }

EditLogTailer

 * TODO EditLogTailer是一个后台线程,启动了以后会周期性的去journalnode集群上面去
 * 读取元数据日志,然后再把这些元数据日志应用到自己的元数据里面(内存+磁盘)
 */
    private void doWork() {
      while (shouldRun) {
        try {
          if (tooLongSinceLastLoad() &&
              lastRollTriggerTxId < lastLoadedTxnId) {
        	  //回滚
            triggerActiveLogRoll();
          }
          if (!shouldRun) {
            break;
          }
          namesystem.cpLockInterruptibly();
          try {
        	  //TODO 重要的代码
            doTailEdits();
          } finally {
            namesystem.cpUnlock();
          }
        } catch (EditLogInputException elie) {
          LOG.warn("Error while reading edits from disk. Will try again.", elie);
        } catch (InterruptedException ie) {
          // interrupter should have already set shouldRun to false
          continue;
        } catch (Throwable t) {
          LOG.fatal("Unknown error encountered while tailing edits. " +
              "Shutting down standby NN.", t);
          terminate(1, t);
        }
        try {
          //TODO 每隔60秒 StandByNameNode 去Journalnode获取一下日志
          Thread.sleep(sleepTimeMs);
        } catch (InterruptedException e) {
          LOG.warn("Edit log tailer interrupted", e);
        }
      }
    }
  }
void doTailEdits() throws IOException, InterruptedException {
    // Write lock needs to be interruptible here because the 
    // transitionToActive RPC takes the write lock before calling
    // tailer.stop() -- so if we're not interruptible, it will
    // deadlock.
    namesystem.writeLockInterruptibly();
    try {
    	//TODO 加载当前自己的元数据日志
      FSImage image = namesystem.getFSImage();
      //TODO 获取当前日志的最后一条日志的事务ID是多少
      //1000
      long lastTxnId = image.getLastAppliedTxId();
      
      if (LOG.isDebugEnabled()) {
        LOG.debug("lastTxnId: " + lastTxnId);
      }
      Collection<EditLogInputStream> streams;
      try {
    	  //这个地方是重要的代码
    	  //需要去journlanode上面去读取元数据
    	  //我现在的事务id 1000,所以我去journlanode上面去读取
    	  //日志的时候,只需要去读取 1001后面的日志就可以。
        //TODO 设置获取Journalnode获取日志的流
        streams = editLog.selectInputStreams(lastTxnId + 1, 0, null, false);
      } catch (IOException ioe) {
        // This is acceptable. If we try to tail edits in the middle of an edits
        // log roll, i.e. the last one has been finalized but the new inprogress
        // edits file hasn't been started yet.
        LOG.warn("Edits tailer failed to find any streams. Will try again " +
            "later.", ioe);
        return;
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("edit streams to load from: " + streams.size());
      }
      
      // Once we have streams to load, errors encountered are legitimate cause
      // for concern, so we don't catch them here. Simple errors reading from
      // disk are ignored.
      long editsLoaded = 0;
      try {
    	  //TODO 去Journalnode加载日志
        editsLoaded = image.loadEdits(streams, namesystem);
      } catch (EditLogInputException elie) {
        editsLoaded = elie.getNumEditsLoaded();
        throw elie;
      } finally {
        if (editsLoaded > 0 || LOG.isDebugEnabled()) {
          LOG.info(String.format("Loaded %d edits starting from txid %d ",
              editsLoaded, lastTxnId));
        }
      }

      if (editsLoaded > 0) {
        lastLoadTimeMs = monotonicNow();
      }
      lastLoadedTxnId = image.getLastAppliedTxId();
    } finally {
      namesystem.writeUnlock();
    }
  }

TODO 去Journalnode加载日志

 private long loadEdits(Iterable<EditLogInputStream> editStreams,
      FSNamesystem target, StartupOption startOpt, MetaRecoveryContext recovery)
      throws IOException {
    LOG.debug("About to load edits:\n  " + Joiner.on("\n  ").join(editStreams));
    StartupProgress prog = NameNode.getStartupProgress();
    prog.beginPhase(Phase.LOADING_EDITS);
    
    long prevLastAppliedTxId = lastAppliedTxId;  
    try {
    	
      //TODO 构建了FSEDitLogLoader
      FSEditLogLoader loader = new FSEditLogLoader(target, lastAppliedTxId);
      
      // Load latest edits
      for (EditLogInputStream editIn : editStreams) {
        LOG.info("Reading " + editIn + " expecting start txid #" +
              (lastAppliedTxId + 1));
        try {
        	//TODO 加载日志
          loader.loadFSEdits(editIn, lastAppliedTxId + 1, startOpt, recovery);
        } finally {
          // Update lastAppliedTxId even in case of error, since some ops may
          // have been successfully applied before the error.
          //TODO 记录最后的一个事务ID
          //1000 -> 2000
          lastAppliedTxId = loader.getLastAppliedTxId();
        }
        // If we are in recovery mode, we may have skipped over some txids.
        if (editIn.getLastTxId() != HdfsConstants.INVALID_TXID) {
          lastAppliedTxId = editIn.getLastTxId();
        }
      }
    } finally {
      FSEditLog.closeAllStreams(editStreams);
      // update the counts
      updateCountForQuota(target.getBlockManager().getStoragePolicySuite(),
          target.dir.rootDir);
    }
    prog.endPhase(Phase.LOADING_EDITS);
    return lastAppliedTxId - prevLastAppliedTxId;
  }

long loadFSEdits(EditLogInputStream edits, long expectedStartingTxId,
      StartupOption startOpt, MetaRecoveryContext recovery) throws IOException {
    StartupProgress prog = NameNode.getStartupProgress();
    Step step = createStartupProgressStep(edits);
    prog.beginStep(Phase.LOADING_EDITS, step);
    fsNamesys.writeLock();
    try {
      long startTime = monotonicNow();
      FSImage.LOG.info("Start loading edits file " + edits.getName());
      //TODO 重要代码
      long numEdits = loadEditRecords(edits, false, expectedStartingTxId,
          startOpt, recovery);
      FSImage.LOG.info("Edits file " + edits.getName() 
          + " of size " + edits.length() + " edits # " + numEdits 
          + " loaded in " + (monotonicNow()-startTime)/1000 + " seconds");
      return numEdits;
    } finally {
      edits.close();
      fsNamesys.writeUnlock();
      prog.endStep(Phase.LOADING_EDITS, step);
    }
  }
 long loadEditRecords(EditLogInputStream in, boolean closeOnExit,
      long expectedStartingTxId, StartupOption startOpt,
      MetaRecoveryContext recovery) throws IOException {
    FSDirectory fsDir = fsNamesys.dir;

    EnumMap<FSEditLogOpCodes, Holder<Integer>> opCounts =
      new EnumMap<FSEditLogOpCodes, Holder<Integer>>(FSEditLogOpCodes.class);

    if (LOG.isTraceEnabled()) {
      LOG.trace("Acquiring write lock to replay edit log");
    }

    fsNamesys.writeLock();
    fsDir.writeLock();

    long recentOpcodeOffsets[] = new long[4];
    Arrays.fill(recentOpcodeOffsets, -1);
    
    long expectedTxId = expectedStartingTxId;
    long numEdits = 0;
    long lastTxId = in.getLastTxId();
    long numTxns = (lastTxId - expectedStartingTxId) + 1;
    StartupProgress prog = NameNode.getStartupProgress();
    Step step = createStartupProgressStep(in);
    prog.setTotal(Phase.LOADING_EDITS, step, numTxns);
    Counter counter = prog.getCounter(Phase.LOADING_EDITS, step);
    long lastLogTime = monotonicNow();
    long lastInodeId = fsNamesys.dir.getLastInodeId();
    
    try {
      while (true) {
        try {
          FSEditLogOp op;
          try {
            /**
             *
             * TODO 通过一个输入流去读取数据
             *  op有可能是上传文件的日志
             *  也有可能是创建目录的日志
             *  因为我们这次分析的是创建目录
             *
             */
            op = in.readOp();
            if (op == null) {
              break;
            }
          } catch (Throwable e) {
            // Handle a problem with our input
            check203UpgradeFailure(in.getVersion(true), e);
            String errorMessage =
              formatEditLogReplayError(in, recentOpcodeOffsets, expectedTxId);
            FSImage.LOG.error(errorMessage, e);
            if (recovery == null) {
               // We will only try to skip over problematic opcodes when in
               // recovery mode.
              throw new EditLogInputException(errorMessage, e, numEdits);
            }
            MetaRecoveryContext.editLogLoaderPrompt(
                "We failed to read txId " + expectedTxId,
                recovery, "skipping the bad section in the log");
            in.resync();
            continue;
          }
          recentOpcodeOffsets[(int)(numEdits % recentOpcodeOffsets.length)] =
            in.getPosition();
          if (op.hasTransactionId()) {
            if (op.getTransactionId() > expectedTxId) { 
              MetaRecoveryContext.editLogLoaderPrompt("There appears " +
                  "to be a gap in the edit log.  We expected txid " +
                  expectedTxId + ", but got txid " +
                  op.getTransactionId() + ".", recovery, "ignoring missing " +
                  " transaction IDs");
            } else if (op.getTransactionId() < expectedTxId) { 
              MetaRecoveryContext.editLogLoaderPrompt("There appears " +
                  "to be an out-of-order edit in the edit log.  We " +
                  "expected txid " + expectedTxId + ", but got txid " +
                  op.getTransactionId() + ".", recovery,
                  "skipping the out-of-order edit");
              continue;
            }
          }
          try {
            if (LOG.isTraceEnabled()) {
              LOG.trace("op=" + op + ", startOpt=" + startOpt
                  + ", numEdits=" + numEdits + ", totalEdits=" + totalEdits);
            }
            //TODO 把获取到的元数据作用到自己的元数据里
            long inodeId = applyEditLogOp(op, fsDir, startOpt,
                in.getVersion(true), lastInodeId);
            if (lastInodeId < inodeId) {
              lastInodeId = inodeId;
            }
          } catch (RollingUpgradeOp.RollbackException e) {
            throw e;
          } catch (Throwable e) {
            LOG.error("Encountered exception on operation " + op, e);
            if (recovery == null) {
              throw e instanceof IOException? (IOException)e: new IOException(e);
            }

            MetaRecoveryContext.editLogLoaderPrompt("Failed to " +
             "apply edit log operation " + op + ": error " +
             e.getMessage(), recovery, "applying edits");
          }
          // Now that the operation has been successfully decoded and
          // applied, update our bookkeeping.
          incrOpCount(op.opCode, opCounts, step, counter);
          if (op.hasTransactionId()) {
            lastAppliedTxId = op.getTransactionId();
            expectedTxId = lastAppliedTxId + 1;
          } else {
            expectedTxId = lastAppliedTxId = expectedStartingTxId;
          }
          // log progress
          if (op.hasTransactionId()) {
            long now = monotonicNow();
            if (now - lastLogTime > REPLAY_TRANSACTION_LOG_INTERVAL) {
              long deltaTxId = lastAppliedTxId - expectedStartingTxId + 1;
              int percent = Math.round((float) deltaTxId / numTxns * 100);
              LOG.info("replaying edit log: " + deltaTxId + "/" + numTxns
                  + " transactions completed. (" + percent + "%)");
              lastLogTime = now;
            }
          }
          numEdits++;
          totalEdits++;
        } catch (RollingUpgradeOp.RollbackException e) {
          LOG.info("Stopped at OP_START_ROLLING_UPGRADE for rollback.");
          break;
        } catch (MetaRecoveryContext.RequestStopException e) {
          MetaRecoveryContext.LOG.warn("Stopped reading edit log at " +
              in.getPosition() + "/"  + in.length());
          break;
        }
      }
    } finally {
      fsNamesys.dir.resetLastInodeId(lastInodeId);
      if(closeOnExit) {
        in.close();
      }
      fsDir.writeUnlock();
      fsNamesys.writeUnlock();

      if (LOG.isTraceEnabled()) {
        LOG.trace("replaying edit log finished");
      }

      if (FSImage.LOG.isDebugEnabled()) {
        dumpOpCounts(opCounts);
      }
    }
    return numEdits;
  }

applyEditLogOp里面有这个关键代码,更新自己内存的日志,与前面相似:

 。。。。。。.。。..。。....。。
 //TODO 创建目录的日志
    case OP_MKDIR: {
    	//根据匹配规则我们这次的日志
    	//应该是一个创建目录的日志。
      MkdirOp mkdirOp = (MkdirOp)op;
      inodeId = getAndUpdateLastInodeId(mkdirOp.inodeId, logVersion,
          lastInodeId);
      //TODO 把数据作用于自己的元数据里面。
      FSDirMkdirOp.mkdirForEditLog(fsDir, inodeId,
          renameReservedPathsOnUpgrade(mkdirOp.path, logVersion),
          mkdirOp.permissions, mkdirOp.aclEntries, mkdirOp.timestamp);
      break;
    }

到这里反思以下,它是如何读到日志文件的呢

在上面loadEditRecords()方法里面存在:

try {
            /**
             *
             * TODO 通过一个输入流去读取数据
             *  op有可能是上传文件的日志
             *  也有可能是创建目录的日志
             *  因为我们这次分析的是创建目录
             *
             */
            op = in.readOp();
  public FSEditLogOp readOp() throws IOException {
    FSEditLogOp ret;
    if (cachedOp != null) {
      ret = cachedOp;
      cachedOp = null;
      return ret;
    }
    //TODO 重要
    return nextOp();
  }

本地调用的是这个方法:

  @Override
  protected FSEditLogOp nextOp() throws IOException {
    //TODO 重要
    return nextOpImpl(false);
  }
  private FSEditLogOp nextOpImpl(boolean skipBrokenEdits) throws IOException {
    FSEditLogOp op = null;
    switch (state) {
    case UNINIT:
      try {
        init(true);
      } catch (Throwable e) {
        LOG.error("caught exception initializing " + this, e);
        if (skipBrokenEdits) {
          return null;
        }
        Throwables.propagateIfPossible(e, IOException.class);
      }
      Preconditions.checkState(state != State.UNINIT);
      return nextOpImpl(skipBrokenEdits);
    case OPEN:
        //TODO 通过reader读取日志
      op = reader.readOp(skipBrokenEdits);
      if ((op != null) && (op.hasTransactionId())) {
        long txId = op.getTransactionId();
        if ((txId >= lastTxId) &&
            (lastTxId != HdfsConstants.INVALID_TXID)) {
          //
          // Sometimes, the NameNode crashes while it's writing to the
          // edit log.  In that case, you can end up with an unfinalized edit log
          // which has some garbage at the end.
          // JournalManager#recoverUnfinalizedSegments will finalize these
          // unfinished edit logs, giving them a defined final transaction 
          // ID.  Then they will be renamed, so that any subsequent
          // readers will have this information.
          //
          // Since there may be garbage at the end of these "cleaned up"
          // logs, we want to be sure to skip it here if we've read everything
          // we were supposed to read out of the stream.
          // So we force an EOF on all subsequent reads.
          //
          long skipAmt = log.length() - tracker.getPos();
          if (skipAmt > 0) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("skipping " + skipAmt + " bytes at the end " +
                  "of edit log  '" + getName() + "': reached txid " + txId +
                  " out of " + lastTxId);
            }
            tracker.clearLimit();
            IOUtils.skipFully(tracker, skipAmt);
          }
        }
      }
      break;
      case CLOSED:
        break; // return null
    }
    return op;
  }

其实reader封装的就是一个http,通过http去进行数据的读取,其实就是一个RPC,直接远程调用getjournal()

总体流程

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值