前一篇文章已经讲到对于没有使用表函数的情况,pool->getManyChecked()这个方法是重点。
对于这个方法的返回值可以看一下,TryResult的定义如下:
/// Outcome of a single attempt to obtain an entry from one nested pool.
struct TryResult {
    TryResult() = default;

    /// Wraps an already obtained entry and marks it as both usable and up to date.
    explicit TryResult(Entry entry_)
            : entry(std::move(entry_)) {
        is_usable = true;
        is_up_to_date = true;
    }

    /// Puts every field back to its default-constructed value
    /// (members are assigned in declaration order, entry first).
    void reset() {
        *this = TryResult();
    }

    /// The entry obtained from the pool (default-constructed when nothing was obtained).
    Entry entry;

    /// If false, the entry is unusable for the current request
    /// (but may be usable for other requests, so error counts are not incremented).
    bool is_usable = false;

    /// If true, the entry is a connection to an up-to-date replica.
    bool is_up_to_date = false;

    /// Helps choosing the "least stale" option when all replicas are stale.
    double staleness = 0.0;
};
其中Entry代表的应该是连接,其余的表示这个连接的一些参数,包括是否可用,是否是最新的数据,以及过期程度。
getManyChecked()方法的实现如下:
std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::getManyChecked(
const Settings *settings, PoolMode pool_mode, const QualifiedTableName &table_to_check) {
TryGetEntryFunc try_get_entry = [&](NestedPool &pool, std::string &fail_message) {
return tryGetEntry(pool, fail_message, settings, &table_to_check);
};
return getManyImpl(settings, pool_mode, try_get_entry);
}
1.1 tryGetEntry()方法定义如下:
/// Tries to take one entry from `pool` and classifies it.
///
/// @param pool            nested pool to take the entry from.
/// @param fail_message    receives a description when the table is missing or a network-level error occurred.
/// @param settings        may be null; supplies max_replica_delay_for_distributed_queries.
/// @param table_to_check  may be null; when set, the status of this table on the server is requested.
/// @return TryResult whose flags describe the entry; after a network-level failure the result is reset
///         (only if an entry had been obtained) so the entry is null.
/// Exceptions other than NETWORK_ERROR / SOCKET_TIMEOUT / ATTEMPT_TO_READ_AFTER_EOF are rethrown.
ConnectionPoolWithFailover::TryResult
ConnectionPoolWithFailover::tryGetEntry(
        IConnectionPool &pool,
        std::string &fail_message,
        const Settings *settings,
        const QualifiedTableName *table_to_check) {
    TryResult result;
    try {
        result.entry = pool.get(settings, /* force_connected = */ false);

        /// The server revision is only queried when there is a table to check.
        const UInt64 server_revision = table_to_check ? result.entry->getServerRevision() : 0;

        /// Without a table name, or with a server revision below DBMS_MIN_REVISION_WITH_TABLES_STATUS,
        /// no status request is made: the entry is force-connected and accepted as is.
        const bool can_request_status =
                table_to_check && server_revision >= DBMS_MIN_REVISION_WITH_TABLES_STATUS;
        if (!can_request_status) {
            result.entry->forceConnected();
            result.is_usable = true;
            result.is_up_to_date = true;
            return result;
        }

        /// Only status of the remote table corresponding to the Distributed table is taken into account.
        /// TODO: request status for joined tables also.
        TablesStatusRequest status_request;
        status_request.tables.emplace(*table_to_check);

        /// The response maps each known table to its status in table_states_by_id.
        TablesStatusResponse status_response = result.entry->getTablesStatus(status_request);
        auto found_status = status_response.table_states_by_id.find(*table_to_check);

        const bool table_is_missing = (found_status == status_response.table_states_by_id.end());
        if (table_is_missing) {
            /// The entry stays non-null but is_usable remains false.
            fail_message = "There is no table " + table_to_check->database + "." + table_to_check->table
                           + " on server: " + result.entry->getDescription();
            LOG_WARNING(log, fail_message);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);
            return result;
        }

        /// The table exists, so the entry can serve this request.
        result.is_usable = true;

        /// A zero limit (or no settings at all) means replica delay is not considered.
        const UInt64 max_allowed_delay =
                settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0;
        if (max_allowed_delay == 0) {
            result.is_up_to_date = true;
            return result;
        }

        /// Up to date only when the reported delay is strictly below the limit.
        const UInt32 delay = found_status->second.absolute_delay;
        result.is_up_to_date = (delay < max_allowed_delay);
        if (!result.is_up_to_date) {
            /// Remember how stale the replica is so the caller can pick the least stale one.
            result.staleness = delay;
            LOG_TRACE(
                    log, "Server " << result.entry->getDescription() << " has unacceptable replica delay "
                                   << "for table " << table_to_check->database << "." << table_to_check->table
                                   << ": " << delay);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
        }
    }
    catch (const Exception &e) {
        /// Only network-level failures are turned into a failed try; anything else propagates.
        const auto error_code = e.code();
        const bool is_network_failure = error_code == ErrorCodes::NETWORK_ERROR
                                        || error_code == ErrorCodes::SOCKET_TIMEOUT
                                        || error_code == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF;
        if (!is_network_failure)
            throw;

        fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false);

        if (!result.entry.isNull()) {
            result.entry->disconnect();
            result.reset();
        }
    }
    return result;
}
1.1.1 getTablesStatus(status_request)方法定义如下:
/// Sends a TablesStatusRequest over this connection and reads back the TablesStatusResponse.
/// Connects first if not yet connected. A server-side Exception packet is rethrown;
/// any other unexpected packet type is reported through throwUnexpectedPacket().
TablesStatusResponse Connection::getTablesStatus(const TablesStatusRequest &request) {
    if (!connected)
        connect();

    TimeoutSetter timeout_setter(*socket, sync_request_timeout, true);

    /// Request: packet type tag, then the payload, then next() on the output buffer.
    /// Per the original author's note, this is the point where the packet goes out to the server,
    /// which handles it in TCPHandler (receivePacket() -> processTablesStatusRequest()).
    writeVarUInt(Protocol::Client::TablesStatusRequest, *out);
    request.write(*out, server_revision);
    out->next();

    /// Response: the packet type tag comes first.
    UInt64 packet_type = 0;
    readVarUInt(packet_type, *in);

    if (packet_type == Protocol::Server::Exception)
        receiveException()->rethrow();
    else if (packet_type != Protocol::Server::TablesStatusResponse)
        throwUnexpectedPacket(packet_type, "TablesStatusResponse");

    TablesStatusResponse tables_status;
    tables_status.read(*in, server_revision);
    return tables_status;
}
1.1.1.1 TCPHandler中processTablesStatusRequest()方法定义如下:
/// Server-side handler: reads a TablesStatusRequest, collects the status of every requested
/// table that exists, and writes a TablesStatusResponse back.
void TCPHandler::processTablesStatusRequest() {
    TablesStatusRequest request;
    request.read(*in, client_revision);

    TablesStatusResponse response;
    for (const QualifiedTableName &requested : request.tables) {
        StoragePtr storage = connection_context.tryGetTable(requested.database, requested.table);

        /// Tables that cannot be found are simply left out of the response.
        if (storage) {
            TableStatus status;

            /// Only StorageReplicatedMergeTree tables report a replication delay.
            auto *replicated = dynamic_cast<StorageReplicatedMergeTree *>(storage.get());
            status.is_replicated = (replicated != nullptr);
            if (replicated)
                status.absolute_delay = replicated->getAbsoluteDelay();

            response.table_states_by_id.emplace(requested, std::move(status));
        }
    }

    writeVarUInt(Protocol::Server::TablesStatusResponse, *out);
    response.write(*out, client_revision);
}
1.1.1.1.1 getAbsoluteDelay()方法的定义如下:
time_t StorageReplicatedMergeTree::getAbsoluteDelay() const
{
time_t min_unprocessed_insert_time = 0;//队列中日志的最小的插入时间
time_t max_processed_insert_time = 0; //队列中日志的最大的插入时间
queue.getInsertTimes(min_unprocessed_insert_time, max_processed_insert_time);
/// Load start time, then finish time to avoid reporting false delay when start time is updated between loading of two variables.
time_t queue_update_start_time = last_queue_update_start_time.load();//队列最近一次更新的开始时间
time_t queue_update_finish_time = last_queue_update_finish_time.load();//队列最近一次更新的结束时间
time_t current_time = time(nullptr);//返回的是从纪元开始至今秒数的整数类型值
if (!queue_update_finish_time)//如果队列最近一次的更新一直没结束, 表示正在向当前队列中加操作日志, 则认为延迟时间是无穷大
{
/// We have not updated queue even once yet (perhaps replica is readonly).
/// As we have no info about the current state of replication log, return effectively infinite delay.
return current_time;
}
else if (min_unprocessed_insert_time)
{
//队列中有未处理的日志, 如果当前时间 > 插入的时间, 表示队列中由未处理的日志, 两者的差值就是延迟时间;
//否则延迟时间=0
/// There are some unprocessed insert entries in queue.
return (current_time > min_unprocessed_insert_time) ? (current_time - min_unprocessed_insert_time) : 0;
}//到这里表示队列中没有未处理的日志了
else if (queue_update_start_time > queue_update_finish_time)
{
/// Queue is empty, but there are some in-flight or failed queue update attempts
/// (likely because of problems with connecting to ZooKeeper).
/// Return the time passed since last attempt.
//队列为空, 但有一些正在运行或失败的队列更新尝试(可能是由于连接到ZooKeeper时出现问题)。
// 返回上次尝试后经过的时间。
return (current_time > queue_update_start_time) ? (current_time - queue_update_start_time) : 0;
}
else
{
//队列为空 且 没有其他问题, 则认为没有延迟, 当前副本是最新的
/// Everything is up-to-date.
return 0;
}
}
1.2 getManyImpl()方法定义如下:
//这里涉及到了负载均衡策略
std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::getManyImpl(
const Settings *settings,
PoolMode pool_mode,
const TryGetEntryFunc &try_get_entry) {
//skip_unavailable_shards = false, 所以min_entries=1
size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1;
size_t max_entries;
if (pool_mode == PoolMode::GET_ALL) {
min_entries = nested_pools.size();
max_entries = nested_pools.size();
} else if (pool_mode == PoolMode::GET_ONE)
max_entries = 1;
else if (pool_mode == PoolMode::GET_MANY)
//max_parallel_replicas=1, 执行查询时每个分片所使用的最大副本数. 对于一致性(获取同一分区的不同部分), 此选项仅适用于指定的采样密钥. 副本的延迟不受控制
//max_parallel_replicas = 1, 所以max_entries=1
max_entries = settings ? size_t(settings->max_parallel_replicas) : 1;
else
throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR);
GetPriorityFunc get_priority;
//load_balancing = RANDOM, 所以这个get_priority对所有的连接池应该都一样
switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing) {
case LoadBalancing::NEAREST_HOSTNAME:
get_priority = [&](size_t i) { return hostname_differences[i]; };
break;
case LoadBalancing::IN_ORDER:
get_priority = [](size_t i) { return i; };
break;
case LoadBalancing::RANDOM:
break;
case LoadBalancing::FIRST_OR_RANDOM:
get_priority = [](size_t i) -> size_t { return i >= 1; };
break;
}
//fallback_to_stale_replicas_for_distributed_queries=1, 即fallback_to_stale_replicas=true
bool fallback_to_stale_replicas = settings ? bool(settings->fallback_to_stale_replicas_for_distributed_queries)
: true;
//调用getMany()这个方法
return Base::getMany(min_entries, max_entries, try_get_entry, get_priority, fallback_to_stale_replicas);
}
1.2.1 getMany()方法定义如下:
/// Core selection routine shared by the failover pools.
///
/// Tries the nested pools in preference order and collects entries until enough up-to-date
/// ones are found or no pool can produce anything more.
///
/// @param min_entries                 minimum number of usable entries required.
/// @param max_entries                 upper bound on the number of entries returned.
/// @param try_get_entry               callback that tries to obtain one entry from a nested pool and may
///                                    fill in a failure message.
/// @param get_priority                optional; when non-empty it assigns a priority to each pool by index.
/// @param fallback_to_stale_replicas  whether stale entries may be returned when there are not enough
///                                    up-to-date ones.
/// @return usable entries, up-to-date ones first, then by increasing staleness.
/// @throws DB::NetException (ALL_CONNECTION_TRIES_FAILED) if fewer than min_entries usable entries were obtained.
/// @throws DB::Exception (ALL_REPLICAS_ARE_STALE) if there are not enough up-to-date entries and
///         fallback_to_stale_replicas is false.
template<typename TNestedPool>
std::vector<typename PoolWithFailoverBase<TNestedPool>::TryResult>
PoolWithFailoverBase<TNestedPool>::getMany(
        size_t min_entries, size_t max_entries,
        const TryGetEntryFunc &try_get_entry,
        const GetPriorityFunc &get_priority,
        bool fallback_to_stale_replicas) {
    /// Update random numbers and error counts.
    PoolStates pool_states = updatePoolStates();// Local copy of the shared pool states.
    /// Assign a priority to every pool state. When get_priority is empty,
    /// the priorities are left as they are in the copied states.
    if (get_priority) {
        for (size_t i = 0; i < pool_states.size(); ++i)
            pool_states[i].priority = get_priority(i);
    }
    /// A nested pool paired with its state, its original index, and the errors seen during this call.
    struct ShuffledPool {
        NestedPool *pool;
        const PoolState *state;
        size_t index;
        size_t error_count = 0;
    };
    /// Sort the pools into order in which they will be tried (based on respective PoolStates).
    std::vector<ShuffledPool> shuffled_pools;
    shuffled_pools.reserve(nested_pools.size());
    for (size_t i = 0; i < nested_pools.size(); ++i)
        shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0});
    /// Ordering is delegated to PoolState::compare (not shown here); per the original author's note it
    /// prefers fewer errors, then priority, then the random number.
    std::sort(
            shuffled_pools.begin(), shuffled_pools.end(),
            [](const ShuffledPool &lhs, const ShuffledPool &rhs) {
                return PoolState::compare(*lhs.state, *rhs.state);
            });
    /// We will try to get a connection from each pool until a connection is produced or max_tries is reached.
    /// try_results[i] corresponds to shuffled_pools[i].
    std::vector<TryResult> try_results(shuffled_pools.size());
    size_t entries_count = 0;
    size_t usable_count = 0;
    size_t up_to_date_count = 0;
    size_t failed_pools_count = 0;
    /// At exit update shared error counts with error counts occurred during this call.
    SCOPE_EXIT(
            {
                std::lock_guard lock(pool_states_mutex);
                for (const ShuffledPool &pool: shuffled_pools)
                    shared_pool_states[pool.index].error_count += pool.error_count;
            });
    std::string fail_messages;
    bool finished = false;
    /// Keep making passes over the pools; a pool that failed is retried on the next pass
    /// until it reaches max_tries errors.
    while (!finished) {
        for (size_t i = 0; i < shuffled_pools.size(); ++i) {
            if (up_to_date_count >= max_entries /// Already enough good entries.
                || entries_count + failed_pools_count >=
                   nested_pools.size()) /// No more good entries will be produced.
            {
                finished = true;
                break;
            }
            ShuffledPool &shuffled_pool = shuffled_pools[i];
            TryResult &result = try_results[i];
            /// Skip pools that have exhausted their tries or that already produced an entry.
            if (shuffled_pool.error_count >= max_tries ||
                !result.entry.isNull())
                continue;
            /// This pool still has tries left and has not produced an entry yet: try it.
            std::string fail_message;
            result = try_get_entry(*shuffled_pool.pool, fail_message);
            if (!fail_message.empty())
                fail_messages += fail_message + '\n';
            /// A non-null entry means an entry was obtained (it may still be unusable or stale).
            if (!result.entry.isNull()) {
                ++entries_count;
                if (result.is_usable) {
                    ++usable_count;
                    if (result.is_up_to_date)
                        ++up_to_date_count;
                }
            } else {
                /// The try failed: count the error, and give up on the pool once max_tries is reached.
                LOG_WARNING(log, "Connection failed at try №"
                        << (shuffled_pool.error_count + 1) << ", reason: " << fail_message);
                ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry);
                ++shuffled_pool.error_count;
                if (shuffled_pool.error_count >= max_tries) {
                    ++failed_pools_count;
                    ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll);
                }
            }
        }
    }
    if (usable_count < min_entries)
        throw DB::NetException(
                "All connection tries failed. Log: \n\n" + fail_messages + "\n",
                DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED);
    /// Drop results that have no entry or are not usable for this request.
    try_results.erase(
            std::remove_if(
                    try_results.begin(), try_results.end(),
                    [](const TryResult &r) { return r.entry.isNull() || !r.is_usable; }),
            try_results.end());
    /// Sort so that preferred items are near the beginning:
    /// up-to-date entries first, then stale ones by increasing staleness.
    std::stable_sort(
            try_results.begin(), try_results.end(),
            [](const TryResult &left, const TryResult &right) {
                return std::forward_as_tuple(!left.is_up_to_date, left.staleness)
                       < std::forward_as_tuple(!right.is_up_to_date, right.staleness);
            });
    if (up_to_date_count >= min_entries) {
        /// There is enough up-to-date entries.
        try_results.resize(up_to_date_count);// Keep only the up-to-date entries, which are sorted to the front.
    } else if (fallback_to_stale_replicas) {
        /// There is not enough up-to-date entries but we are allowed to return stale entries.
        /// Gather all up-to-date ones and least-bad stale ones.
        size_t size = std::min(try_results.size(), max_entries);
        try_results.resize(size);
    } else
        throw DB::Exception(
                "Could not find enough connections to up-to-date replicas. Got: " + std::to_string(up_to_date_count)
                + ", needed: " + std::to_string(min_entries),
                DB::ErrorCodes::ALL_REPLICAS_ARE_STALE);
    return try_results;
}
其中updatePoolStates()也要看一下
1.2.1.1 updatePoolStates()方法定义如下:
/// Refreshes the shared pool states under the mutex and returns a copy of them.
///
/// Every state gets a new random number. Error counts are halved once per elapsed
/// decrease_error_period (and zeroed when the shift would cover the whole width of UInt64).
template<typename TNestedPool>
typename PoolWithFailoverBase<TNestedPool>::PoolStates
PoolWithFailoverBase<TNestedPool>::updatePoolStates() {
    PoolStates snapshot;
    snapshot.reserve(nested_pools.size());

    {
        std::lock_guard lock(pool_states_mutex);

        for (auto &state : shared_pool_states)
            state.randomize();

        const time_t now = time(nullptr);

        if (!last_error_decrease_time) {
            /// First call: just remember the moment from which periods are counted.
            last_error_decrease_time = now;
        } else if (const time_t elapsed = now - last_error_decrease_time; elapsed >= 0) {
            /// Divide error counts by 2 every decrease_error_period seconds.
            const size_t periods_passed = elapsed / decrease_error_period;

            /// Update time but don't do it more often than once a period.
            /// Else if the function is called often enough, error count will never decrease.
            if (periods_passed) {
                last_error_decrease_time = now;

                /// Shifting by the full bit width or more is replaced with an explicit reset to zero.
                const bool reset_completely = periods_passed >= sizeof(UInt64) * CHAR_BIT;
                for (auto &state : shared_pool_states) {
                    if (reset_completely)
                        state.error_count = 0;
                    else
                        state.error_count >>= periods_passed;   /// e.g. 32 >> 2 == 8
                }
            }
        }

        snapshot.assign(shared_pool_states.begin(), shared_pool_states.end());
    }

    return snapshot;
}
到这里基本上就已经知道要选择哪个副本了,一些解释信息都写在注释中了。
1.3 副本之间的leader选择
但是在向复制表中插入数据的时候,副本之间是需要选出一个leader进行merge等操作,这部分代码也先贴在这里吧。
先解释下,StorageReplicatedMergeTree中定义了一些友元类,今天先分析ReplicatedMergeTreeRestartingThread这个,这个类的初始化方法如下:
/// Sets up logging, chooses how often run() should repeat, and registers run() as a task
/// in the schedule pool of the global context.
ReplicatedMergeTreeRestartingThread::ReplicatedMergeTreeRestartingThread(StorageReplicatedMergeTree &storage_)
        : storage(storage_),
          log_name(storage.database_name + "." + storage.table_name + " (ReplicatedMergeTreeRestartingThread)"),
          log(&Logger::get(log_name)),
          active_node_identifier(generateActiveNodeIdentifier()) {
    const auto &table_settings = storage.settings;

    /// Start from the ZooKeeper session expiration check period, converted from seconds to milliseconds.
    check_period_ms = table_settings.zookeeper_session_expiration_check_period.totalSeconds() * 1000;

    /// Periodicity of checking lag of replica: use it instead when it is the shorter of the two.
    if (static_cast<Int64>(table_settings.check_delay_period) * 1000 < check_period_ms)
        check_period_ms = table_settings.check_delay_period * 1000;

    /// The task is named after the logger and calls run() when executed.
    task = storage.global_context.getSchedulePool().createTask(log_name, [this] { run(); });
}
run()方法的定义如下:
/// Body of the task created in the constructor (createTask(log_name, [this] { run(); })).
/// Each invocation:
///  1. (re)initializes the replica on the first run or when the ZooKeeper session has expired;
///  2. once per check_delay_period, measures the replica delays and yields leadership if the
///     relative delay exceeds min_relative_delay_to_yield_leadership;
///  3. reschedules itself after check_period_ms (or after retry_period_ms when initialization failed).
/// Does nothing if need_stop is set. Exceptions are logged and not propagated.
void ReplicatedMergeTreeRestartingThread::run() {
    if (need_stop)
        return;
    try {
        if (first_time || storage.getZooKeeper()->expired()) {
            startup_completed = false;
            if (first_time) {
                LOG_DEBUG(log, "Activating replica.");
            } else {
                LOG_WARNING(log, "ZooKeeper session has expired. Switching to a new session.");
                /// Flip is_readonly to true; the metric is bumped only if this call changed the flag.
                bool old_val = false;
                if (storage.is_readonly.compare_exchange_strong(old_val, true))
                    CurrentMetrics::add(CurrentMetrics::ReadonlyReplica);
                partialShutdown();
            }
            if (!startup_completed) {
                try {
                    storage.setZooKeeper(storage.global_context.getZooKeeper());
                }
                catch (const Coordination::Exception &) {
                    /// The exception when you try to zookeeper_init usually happens if DNS does not work. We will try to do it again.
                    tryLogCurrentException(log, __PRETTY_FUNCTION__);
                    /// On the first run, startup_event is set even though initialization failed.
                    if (first_time)
                        storage.startup_event.set();
                    task->scheduleAfter(retry_period_ms);
                    return;
                }
                /// Startup did not succeed: retry after retry_period_ms.
                if (!need_stop && !tryStartup()) {
                    if (first_time)
                        storage.startup_event.set();
                    task->scheduleAfter(retry_period_ms);
                    return;
                }
                if (first_time)
                    storage.startup_event.set();
                startup_completed = true;
            }
            if (need_stop)
                return;
            /// Flip is_readonly back to false; the metric is decremented only if this call changed the flag.
            bool old_val = true;
            if (storage.is_readonly.compare_exchange_strong(old_val, false))
                CurrentMetrics::sub(CurrentMetrics::ReadonlyReplica);
            first_time = false;
        }
        time_t current_time = time(nullptr);
        /// Run the delay check only if at least check_delay_period has passed since the previous one.
        if (current_time >= prev_time_of_check_delay + static_cast<time_t>(storage.settings.check_delay_period)) {
            /// Find out lag of replicas.
            time_t absolute_delay = 0;
            time_t relative_delay = 0;
            /// Fills in the absolute and relative delay of this replica.
            storage.getReplicaDelays(absolute_delay, relative_delay);
            if (absolute_delay)
                LOG_TRACE(log,
                          "Absolute delay: " << absolute_delay << ". Relative delay: " << relative_delay << ".");
            prev_time_of_check_delay = current_time;// Remember when this check happened.
            /// We give up leadership if the relative lag is greater than threshold.
            if (storage.is_leader
                && relative_delay > static_cast<time_t>(storage.settings.min_relative_delay_to_yield_leadership)) {
                LOG_INFO(log, "Relative replica delay (" << relative_delay << " seconds) is bigger than threshold ("
                                                         << storage.settings.min_relative_delay_to_yield_leadership
                                                         << "). Will yield leadership.");
                ProfileEvents::increment(ProfileEvents::ReplicaYieldLeadership);
                storage.exitLeaderElection();// Leave the leader election ...
                /// NOTE: enterLeaderElection() can throw if node creation in ZK fails.
                /// This is bad because we can end up without a leader on any replica.
                /// In this case we rely on the fact that the session will expire and we will reconnect.
                storage.enterLeaderElection();// ... and enter it again.
            }
        }
    }
    catch (...) {
        /// Any exception: set startup_event, log the exception, and fall through to rescheduling.
        storage.startup_event.set();
        tryLogCurrentException(log, __PRETTY_FUNCTION__);
    }
    task->scheduleAfter(check_period_ms);
}
其中storage.getReplicaDelays(absolute_delay, relative_delay)方法可以重点看下:
void StorageReplicatedMergeTree::getReplicaDelays(time_t & out_absolute_delay, time_t & out_relative_delay)
{
//先确保本机数据不是readonly状态
assertNotReadonly();
time_t current_time = time(nullptr);
/** Relative delay is the maximum difference of absolute delay from any other replica,
* (if this replica lags behind any other live replica, or zero, otherwise).
* Calculated only if the absolute delay is large enough.
*/
/// 绝对延迟是根据本机的日志队列中的日志处理进度计算的;
/// 相对延迟是绝对延迟与其他副本的延迟的最大差值, (其实就是看本地副本是不是落后于其他副本).
/// 如果本地副本落后与其他副本, 则相对延迟不为0, 否则的话相对延迟为0;
/// 相对延迟默认为0, 只有在绝对延迟大于min_relative_delay_to_yield_leadership时才需要计算相对延迟
out_absolute_delay = getAbsoluteDelay();
out_relative_delay = 0;
//当本地副本完全同步时, 相对延迟 = 绝对延迟
//否则当本地副本不是完全同步的且还有未处理的同步任务时, 相对延迟 = 绝对延迟 - 副本的最小延迟
//只有在绝对延迟 大于 min_relative_delay_to_yield_leadership时才需要计算相对延迟
//本机绝对延迟小于等于最大容许的延迟时间, 则本机还可以成为leader, 否则需要计算相对延迟, 再根据相对延迟判断是不是需要本机让出leader角色
//min_relative_delay_to_yield_leadership=120s
if (out_absolute_delay < static_cast<time_t>(settings.min_relative_delay_to_yield_leadership))
return;
auto zookeeper = getZooKeeper();
time_t max_replicas_unprocessed_insert_time = 0;
bool have_replica_with_nothing_unprocessed = false;
Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas");
for (const auto & replica : replicas)//遍历所有的副本
{
if (replica == replica_name)
continue;
/// Skip dead replicas.
if (!zookeeper->exists(zookeeper_path + "/replicas/" + replica + "/is_active"))
continue;
String value;
if (!zookeeper->tryGet(zookeeper_path + "/replicas/" + replica + "/min_unprocessed_insert_time", value))
continue;
time_t replica_time = value.empty() ? 0 : parse<time_t>(value);
if (replica_time == 0)
{
/** Note
* The conclusion that the replica does not lag may be incorrect,
* because the information about `min_unprocessed_insert_time` is taken
* only from that part of the log that has been moved to the queue.
* If the replica for some reason has stalled `queueUpdatingTask`,
* then `min_unprocessed_insert_time` will be incorrect.
*/
have_replica_with_nothing_unprocessed = true;
break;
}
if (replica_time > max_replicas_unprocessed_insert_time)
max_replicas_unprocessed_insert_time = replica_time;
}
if (have_replica_with_nothing_unprocessed)//当副本完全同步时, 相对延迟 = 绝对延迟
out_relative_delay = out_absolute_delay;
else //否则当副本不是完全同步的, 还有未处理的同步任务时, 相对延迟 = 绝对延迟 - 副本的最小延迟
{
max_replicas_unprocessed_insert_time = std::min(current_time, max_replicas_unprocessed_insert_time);
time_t min_replicas_delay = current_time - max_replicas_unprocessed_insert_time;
if (out_absolute_delay > min_replicas_delay)
out_relative_delay = out_absolute_delay - min_replicas_delay;
}
}
getAbsoluteDelay()方法前面已经说过了,这里就不重复写了,就先这样。