Apache doris元数据同步机制源码解析

最新推荐文章于 2024-05-27 17:26:59 发布

hf200012

最新推荐文章于 2024-05-27 17:26:59 发布

阅读量1.6k

点赞数

分类专栏： Doris 文章标签： apache java zookeeper

本文链接：https://blog.csdn.net/hf200012/article/details/117825649

版权

Doris 专栏收录该内容

101 篇文章 112 订阅

订阅专栏

Apache Doris 代码仓库地址：apache/incubator-doris 欢迎大家关注加星

名词解释

FE：Frontend，即 Doris 的前端节点。主要负责接收和返回客户端请求、元数据以及集群管理、查询计划生成等工作。
BE：Backend，即 Doris 的后端节点。主要负责数据存储与管理、查询计划执行等工作。
bdbje：Oracle Berkeley DB Java Edition。在 Doris 中，使用 bdbje 完成元数据操作日志的持久化、FE 高可用等功能。

Doris 的元数据主要存储4类数据：

用户数据信息。包括数据库、表的 Schema、分片信息等。
各类作业信息。如导入作业，Clone 作业、SchemaChange 作业等。
用户及权限信息。
集群及节点信息

元数据的数据流具体过程如下：

只有 leader FE 可以对元数据进行写操作。写操作在修改 leader 的内存后，会序列化为一条log，按照 key-value 的形式写入 bdbje。其中 key 为连续的整型，作为 log id，value 即为序列化后的操作日志。
日志写入 bdbje 后，bdbje 会根据策略（写多数/全写），将日志复制到其他 non-leader 的 FE 节点。non-leader FE 节点通过对日志回放，修改自身的元数据内存镜像，完成与 leader 节点的元数据同步。
leader 节点的日志条数达到阈值后（默认 10w 条），会启动 checkpoint 线程。checkpoint 会读取已有的 image 文件，和其之后的日志，重新在内存中回放出一份新的元数据镜像副本。然后将该副本写入到磁盘，形成一个新的 image。之所以是重新生成一份镜像副本，而不是将已有镜像写成 image，主要是考虑写 image 加读锁期间，会阻塞写操作。所以每次 checkpoint 会占用双倍内存空间。
image 文件生成后，leader 节点会通知其他 non-leader 节点新的 image 已生成。non-leader 主动通过 http 拉取最新的 image 文件，来更换本地的旧文件。
bdbje 中的日志，在 image 生成完成后，会定期删除其中旧的部分。

源码解析

Doris FE启动步骤（只说核心的几个部分）：

Doris启动的时候首先去初始化Catalog，并等待Catalog完成
启动QeServer 这个是mysql client连接用的，端口是9030
启动FeServer这个是Thrift Server，主要是FE和BE之间通讯用的
启动HttpServer ，各种rest api接口及前端web界面

这里我们分析的是元数据这块只看Catalog初始化过程中做了什么事情

 PaloFe ——> start()
     // 初始化Catalog并等待初始化完成
     Catalog.getCurrentCatalog().initialize(args);
     Catalog.getCurrentCatalog().waitForReady();
         Catalog -->initialize() 
             第一步：获取本节点和Helper节点
             getSelfHostPort();
             getHelperNodes(args);
             第二步：检查和创建元数据目录及文件
             第三步：获取集群ID及角色（Observer和Follower）
             getClusterIdAndRole();
             第四步：首先加载 image 并回放 editlog
             this.editLog = new EditLog(nodeName);
             loadImage(this.imageDir); // load image file
             editLog.open(); // open bdb env
             this.globalTransactionMgr.setEditLog(editLog);
             this.idGenerator.setEditLog(editLog);
             第五步：创建load和导出作业标签清理线程（这是一个MasterDaemon守护线程）
             createLabelCleaner()
             第六步：创建 txn（事务）清理线程
             createTxnCleaner();
             第七步：启动状态监听线程，这个线程主要是监听Master，Observer、Follower状态转换，及Observer和Follower元数据同步，Leader选举
             createStateListener();
             listener.start();

Load Job Label清理：createLabelCleaner

 //每个label_keep_max_second（默认三天），从idToLoadJob, dbToLoadJobs and dbLabelToLoadJobs删除旧的job，
 //包括从ExportMgr删除exportjob, exportJob 默认七天清理一次，控制参数history_job_keep_max_second
 //这个线程每个四个小时运行一次，是由label_clean_interval_second参数来控制
 public void createLabelCleaner() {
         labelCleaner = new MasterDaemon("LoadLabelCleaner", Config.label_clean_interval_second * 1000L) {
             @Override
             protected void runAfterCatalogReady() {              
                 load.removeOldLoadJobs();
                 loadManager.removeOldLoadJob();
                 exportMgr.removeOldExportJobs();
             }
         };
     }

事务（txn）清理线程：createTxnCleaner()

 //定期清理过期的事务,默认30秒清理一次，控制参数：transaction_clean_interval_second
 //这里清理的是tnx状态是:
 //1.已过期：VISIBLE(可见) 或者 ABORTED（终止）, 并且 expired（已过期）
 //2.已超时：事务状态是：PREPARE, 但是 timeout
 //事务状态是：COMMITTED和 VISIBLE状态的不能被清除，只能成功
 public void createTxnCleaner() {
         txnCleaner = new MasterDaemon("txnCleaner", Config.transaction_clean_interval_second) {
             @Override
             protected void runAfterCatalogReady() {
                 globalTransactionMgr.removeExpiredAndTimeoutTxns();
             }
         };
     }

FE状态监听器线程 createStateListener()

这个线程主要是监听Master，Observer、Follower状态转换，及Observer和Follower元数据同步，Leader选举

定期检查，默认是100毫秒，参数：STATE_CHANGE_CHECK_INTERVAL_MS

 
     /**
      * Builds (but does not start) the "stateListener" daemon and attaches this catalog's
      * metaContext to it. The daemon is created with STATE_CHANGE_CHECK_INTERVAL_MS as its interval.
      *
      * Each cycle blocks on typeTransferQueue until a new FrontendNodeType arrives, then
      * dispatches on the pair (current feType, new type) to transferToMaster() or
      * transferToNonMaster(newType). Pairs that are not explicitly listed perform no
      * transition call, but feType is still overwritten with the new type afterwards.
      *
      * The process is terminated with exit code -1 when the take() is interrupted, or when the
      * current type is MASTER and any different type is received.
      */
     public void createStateListener() {
         listener = new Daemon("stateListener", STATE_CHANGE_CHECK_INTERVAL_MS) {
             @Override
             protected synchronized void runOneCycle() {
 
                 while (true) {
                     FrontendNodeType newType = null;
                     try {
                         // blocks until another component enqueues a role change
                         newType = typeTransferQueue.take();
                     } catch (InterruptedException e) {
                         // an interrupted listener is treated as fatal: log, print, and exit
                         LOG.error("got exception when take FE type from queue", e);
                         Util.stdoutWithTime("got exception when take FE type from queue. " + e.getMessage());
                         System.exit(-1);
                     }
                     Preconditions.checkNotNull(newType);
                     LOG.info("begin to transfer FE type from {} to {}", feType, newType);
                     if (feType == newType) {
                         // nothing to do; note this leaves runOneCycle() entirely, not just this iteration
                         // (presumably the Daemon base class invokes runOneCycle() again -- confirm)
                         return;
                     }
 
                     /*
                      * INIT -> MASTER: transferToMaster
                      * INIT -> FOLLOWER/OBSERVER: transferToNonMaster
                      * UNKNOWN -> MASTER: transferToMaster
                      * UNKNOWN -> FOLLOWER/OBSERVER: transferToNonMaster
                      * FOLLOWER -> MASTER: transferToMaster
                      * FOLLOWER/OBSERVER -> INIT/UNKNOWN: set isReady to false
                      */
                     // outer switch: the role this node currently has; inner switch: the role it was told to take
                     switch (feType) {
                         case INIT: {
                             switch (newType) {
                                 case MASTER: {
                                     transferToMaster();
                                     break;
                                 }
                                 case FOLLOWER:
                                 case OBSERVER: {
                                     transferToNonMaster(newType);
                                     break;
                                 }
                                 case UNKNOWN:
                                     break;
                                 default:
                                     break;
                             }
                             break;
                         }
                         case UNKNOWN: {
                             switch (newType) {
                                 case MASTER: {
                                     transferToMaster();
                                     break;
                                 }
                                 case FOLLOWER:
                                 case OBSERVER: {
                                     transferToNonMaster(newType);
                                     break;
                                 }
                                 default:
                                     break;
                             }
                             break;
                         }
                         case FOLLOWER: {
                             switch (newType) {
                                 case MASTER: {
                                     transferToMaster();
                                     break;
                                 }
                                 case UNKNOWN: {
                                     // transferToNonMaster handles the FOLLOWER -> UNKNOWN downgrade itself
                                     transferToNonMaster(newType);
                                     break;
                                 }
                                 default:
                                     break;
                             }
                             break;
                         }
                         case OBSERVER: {
                             // only OBSERVER -> UNKNOWN triggers a call; there is no OBSERVER -> MASTER branch here
                             switch (newType) {
                                 case UNKNOWN: {
                                     transferToNonMaster(newType);
                                     break;
                                 }
                                 default:
                                     break;
                             }
                             break;
                         }
                         case MASTER: {
                             // exit if master changed to any other type
                             String msg = "transfer FE type from MASTER to " + newType.name() + ". exit";
                             LOG.error(msg);
                             Util.stdoutWithTime(msg);
                             System.exit(-1);
                             // no break here: control would fall through to default, which only breaks
                         }
                         default:
                             break;
                     } // end switch formerFeType
 
                     // record the new role regardless of which branch (if any) ran above
                     feType = newType;
                     LOG.info("finished to transfer FE type to {}", feType);
                 }
             } // end runOneCycle
         };
 
         listener.setMetaContext(metaContext);
     }

Leader的选举通过：

transferToNonMaster和transferToMaster

元数据同步相关的方法：startMasterOnlyDaemonThreads 会启动 Checkpoint 守护线程，由 Master 定期向各个 Follower 和 Observer 推送 image，然后各节点在本地做 image 回放，更新本节点的元数据；这个线程只在 Master 节点启动。startNonMasterDaemonThreads 则用于启动在所有 FE 节点上都运行的其他守护线程，包括 TabletStatMgr、LabelCleaner、EsRepository、DomainResolver。

 private void transferToNonMaster(FrontendNodeType newType) {
         isReady.set(false);
         if (feType == FrontendNodeType.OBSERVER || feType == FrontendNodeType.FOLLOWER) {
             Preconditions.checkState(newType == FrontendNodeType.UNKNOWN);
             LOG.warn("{} to UNKNOWN, still offer read service", feType.name());
             // not set canRead here, leave canRead as what is was.
             // if meta out of date, canRead will be set to false in replayer thread.
             metaReplayState.setTransferToUnknown();
             return;
         }
 
         // transfer from INIT/UNKNOWN to OBSERVER/FOLLOWER
         // add helper sockets
         if (Config.edit_log_type.equalsIgnoreCase("BDB")) {
             for (Frontend fe : frontends.values()) {
                 if (fe.getRole() == FrontendNodeType.FOLLOWER || fe.getRole() == FrontendNodeType.REPLICA) {
                     ((BDBHA) getHaProtocol()).addHelperSocket(fe.getHost(), fe.getEditLogPort());
                 }
             }
         }
 
         if (replayer == null) {
             //创建回放线程
             createReplayer();
             replayer.start();
         }
 
         // 'isReady' will be set to true in 'setCanRead()' method
         fixBugAfterMetadataReplayed(true);
         
         startNonMasterDaemonThreads();
 
         MetricRepo.init();
     }

创建 editlog 回放守护线程，这里主要是将 Master 推送的 Image 日志信息在本地进行回放，写到 editlog 中

 public void createReplayer() {
     replayer = new Daemon("replayer", REPLAY_INTERVAL_MS) {
         @Override
         protected void runOneCycle() {
             boolean err = false;
             boolean hasLog = false;
             try {
                 //进行image回放，重写本地editlog
                 hasLog = replayJournal(-1);
                 metaReplayState.setOk();
             } catch (InsufficientLogException insufficientLogEx) {
                 // 从以下成员中复制丢失的日志文件：拥有文件的复制组 
                 LOG.error("catch insufficient log exception. please restart.", insufficientLogEx);
                 NetworkRestore restore = new NetworkRestore();
                 NetworkRestoreConfig config = new NetworkRestoreConfig();
                 config.setRetainLogFiles(false);
                 restore.execute(insufficientLogEx, config);
                 System.exit(-1);
             } catch (Throwable e) {
                 LOG.error("replayer thread catch an exception when replay journal.", e);
                 metaReplayState.setException(e);
                 try {
                     Thread.sleep(5000);
                 } catch (InterruptedException e1) {
                     LOG.error("sleep got exception. ", e);
                 }
                 err = true;
             }
 
             setCanRead(hasLog, err);
         }
     };
     replayer.setMetaContext(metaContext);
 }

日志回放，重写本地editlog

 
     public synchronized boolean replayJournal(long toJournalId) {
         long newToJournalId = toJournalId;
         if (newToJournalId == -1) {
             newToJournalId = getMaxJournalId();
         }
         if (newToJournalId <= replayedJournalId.get()) {
             return false;
         }
 
         LOG.info("replayed journal id is {}, replay to journal id is {}", replayedJournalId, newToJournalId);
         JournalCursor cursor = editLog.read(replayedJournalId.get() + 1, newToJournalId);
         if (cursor == null) {
             LOG.warn("failed to get cursor from {} to {}", replayedJournalId.get() + 1, newToJournalId);
             return false;
         }
 
         long startTime = System.currentTimeMillis();
         boolean hasLog = false;
         while (true) {
             JournalEntity entity = cursor.next();
             if (entity == null) {
                 break;
             }
             hasLog = true;
             //生成新的editlog
             EditLog.loadJournal(this, entity);
             replayedJournalId.incrementAndGet();
             LOG.debug("journal {} replayed.", replayedJournalId);
             if (feType != FrontendNodeType.MASTER) {
                 journalObservable.notifyObservers(replayedJournalId.get());
             }
             if (MetricRepo.isInit) {
                 // Metric repo may not init after this replay thread start
                 MetricRepo.COUNTER_EDIT_LOG_READ.increase(1L);
             }
         }
         long cost = System.currentTimeMillis() - startTime;
         if (cost >= 1000) {
             LOG.warn("replay journal cost too much time: {} replayedJournalId: {}", cost, replayedJournalId);
         }
 
         return hasLog;
     }

只有角色为 Master 的 FE 才会主动定期生成 image 文件。每次生成完后，都会推送给其他非 Master 角色的 FE。当确认其他所有 FE 都收到这个 image 后，Master FE 会删除 bdbje 中旧的元数据 journal。所以，如果 image 生成失败，或者 image 推送给其他 FE 失败时，都会导致 bdbje 中的数据不断累积。

在Master节点日志中搜索你可以看到下面这个日志，一分钟一次

 2021-04-16 08:34:34,554 INFO (leaderCheckpointer|72) [BDBJEJournal.getFinalizedJournalId():410] database names: 52491702 
 2021-04-16 08:34:34,554 INFO (leaderCheckpointer|72) [Checkpoint.runAfterCatalogReady():81] checkpoint imageVersion 52491701, checkPointVersion 0

CheckPoint线程的启动只在Master Fe节点，在Catalog.startMasterOnlyDaemonThreads方法里启动的

在这里startMasterOnlyDaemonThreads方法里会在Master Fe 节点启动一个 TimePrinter 线程。该线程会定期向 bdbje 中写入一个当前时间的 key-value 条目。其余 non-leader 节点通过回放这条日志，读取日志中记录的时间，和本地时间进行比较，如果发现和本地时间的落后大于指定的阈值（配置项：meta_delay_toleration_second。写入间隔为该配置项的一半），则该节点会处于不可读的状态，当查询或者load等任务落到这节点的时候会报：failed to call frontend service异常。此机制解决了 non-leader 节点在长时间和 leader 失联后，仍然提供过期的元数据服务的问题。

所以这里整个集群是需要做NTP时间同步，保持各个节点时间一致，避免因为时间差异造成的服务不可用

 // start all daemon threads only running on Master
 // NOTE: this is an abridged excerpt; the "...." lines mark code omitted by the article's author.
     private void startMasterOnlyDaemonThreads() {
         // start checkpoint thread
         checkpointer = new Checkpoint(editLog);
         checkpointer.setMetaContext(metaContext);
         // set "checkpointThreadId" before the checkpoint thread start, because the thread
         // need to check the "checkpointThreadId" when running.
         checkpointThreadId = checkpointer.getId();
 
         checkpointer.start();
         ....
         // time printer
         // created first, then started
         createTimePrinter();
         timePrinter.start();
         ....
         updateDbUsedDataQuotaDaemon.start();
     }

CheckPoint线程启动以后会定期向非Master FE推送Image日志信息，默认是一分钟，配置参数：checkpoint_interval_second

具体方法：runAfterCatalogReady

Master FE定期向非Master FE推送image日志信息
删除旧的 journals：获取每个非 Master 节点当前已回放的 journal ID。删除 bdb 数据库时，不能删除比任何非 Master 节点当前 journal ID 更新的 db，否则这个滞后的节点将永远无法获取已被删除的 journal。
最后删除旧的image文件

         // push image file to all the other non master nodes
         // DO NOT get other nodes from HaProtocol, because node may not in bdbje replication group yet.
         // (excerpt from the middle of a method: replayedJournalId and checkPointVersion are
         // defined in the part of the method that is not shown here)
         List<Frontend> allFrontends = Catalog.getServingCatalog().getFrontends(null);
         int successPushed = 0;
         int otherNodesCount = 0;
         if (!allFrontends.isEmpty()) {
             // presumably the master itself is always part of allFrontends -- confirm
             otherNodesCount = allFrontends.size() - 1; // skip master itself
             for (Frontend fe : allFrontends) {
                 String host = fe.getHost();
                 if (host.equals(Catalog.getServingCatalog().getMasterIp())) {
                     // skip master itself
                     continue;
                 }
                 int port = Config.http_port;
                 
                 // the request carries the image version and this node's http port to the peer's /put endpoint
                 String url = "http://" + host + ":" + port + "/put?version=" + replayedJournalId
                         + "&port=" + port;
                 LOG.info("Put image:{}", url);
 
                 try {
                     // the response body is discarded; only success/failure of the request matters
                     MetaHelper.getRemoteFile(url, PUT_TIMEOUT_SECOND * 1000, new NullOutputStream());
                     successPushed++;
                 } catch (IOException e) {
                     // a failed push is only logged; the loop continues with the next frontend
                     LOG.error("Exception when pushing image file. url = {}", url, e);
                 }
             }
             
             LOG.info("push image.{} to other nodes. totally {} nodes, push succeed {} nodes",
                      replayedJournalId, otherNodesCount, successPushed);
         }
         
         // Delete old journals
         // only when every other node accepted the push (also true when there are no other nodes: 0 == 0)
         if (successPushed == otherNodesCount) {
             long minOtherNodesJournalId = Long.MAX_VALUE;
             long deleteVersion = checkPointVersion;
             if (successPushed > 0) {
                 for (Frontend fe : allFrontends) {
                     String host = fe.getHost();
                     if (host.equals(Catalog.getServingCatalog().getMasterIp())) {
                         // skip master itself
                         continue;
                     }
                     int port = Config.http_port;
                     URL idURL;
                     HttpURLConnection conn = null;
                     try {
                         /*
                          * get current replayed journal id of each non-master nodes.
                          * when we delete bdb database, we cannot delete db newer than
                          * any non-master node's current replayed journal id. otherwise,
                          * this lagging node can never get the deleted journal.
                          */
                         idURL = new URL("http://" + host + ":" + port + "/journal_id");
                         conn = (HttpURLConnection) idURL.openConnection();
                         conn.setConnectTimeout(CONNECT_TIMEOUT_SECOND * 1000);
                         conn.setReadTimeout(READ_TIMEOUT_SECOND * 1000);
                         // NOTE(review): getHeaderField returns null when the "id" header is absent;
                         // Long.parseLong would then throw NumberFormatException, which the
                         // IOException handler below does not catch -- confirm this is intended.
                         String idString = conn.getHeaderField("id");
                         long id = Long.parseLong(idString);
                         if (minOtherNodesJournalId > id) {
                             minOtherNodesJournalId = id;
                         }
                     } catch (IOException e) {
                         LOG.error("Exception when getting current replayed journal id. host={}, port={}",
                                 host, port, e);
                         // forces deleteVersion below to be at most 0, and stops querying further nodes
                         minOtherNodesJournalId = 0;
                         break;
                     } finally {
                         if (conn != null) {
                             conn.disconnect();
                         }
                     }
                 }
                 // never delete past the checkpoint, nor past the slowest node's replayed id
                 deleteVersion = Math.min(minOtherNodesJournalId, checkPointVersion);
             }
             // delete old journals
             editLog.deleteJournals(deleteVersion + 1);
             if (MetricRepo.isInit) {
                 MetricRepo.COUNTER_IMAGE_PUSH.increase(1L);
             }
             LOG.info("journals <= {} are deleted. image version {}, other nodes min version {}", 
                      deleteVersion, checkPointVersion, minOtherNodesJournalId);
         }
         
         // delete old image files
         MetaCleaner cleaner = new MetaCleaner(Config.meta_dir + "/image");
         try {
             cleaner.clean();
         } catch (IOException e) {
             // cleanup failure is logged only
             LOG.error("Master delete old image file fail.", e);
         }

hf200012

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Apache doris元数据同步机制源码解析

名词解释 FE：Frontend，即 Doris 的前端节点。主要负责接收和返回客户端请求、元数据以及集群管理、查询计划生成等工作。 BE：Backend，即 Doris 的后端节点。主要负责数据存储与管理、查询计划执行等工作。 bdbje：Oracle Berkeley DB Java Edition。在 Doris 中，使用 bdbje 完成元数据操作日志的持久化、FE 高可用等功能。 Doris 的元数据主要存储4类数据：用户数据信息。包括数据库、表的 Schem
复制链接

扫一扫