通常在下面两种情况下,会发生数据复制操作:1 由于节点故障导致该节点上的block数据均丢失 ;2 有新增加的节点时,由于新旧节点上的磁盘利用率不平衡时。
1 节点故障
在tfs里面,nameserver会启动心跳线程,定期扫描所有logicblock的状态信息(数据副本版本一致性、副本个数、磁盘空间利用率等)。例如:当一个block的空间利用率小于一个给定的阈值时,发生数据compact磁盘回收操作;当一个block的副本数小于规定的大小时,会发生数据replication操作。
由于数据节点不可用导致该节点上的block的副本数都变小,而每个block的副本总数量通常为3,这时会发生数据复制操作,代码逻辑如下:
// Decide whether `block` needs a replication plan.
// On a positive decision, `priority` carries the plan priority and `servers`
// receives the block's current replica holders.
// Returns true only when the priority reaches PLAN_PRIORITY_NORMAL and no
// task for this block is already pending in the task manager.
bool BlockManager::need_replicate(ArrayHelper<uint64_t>& servers, PlanPriority& priority, const BlockCollect* block, const time_t now) const
{
  bool need = (NULL != block);
  if (need)
  {
    // Inspect block state under the per-bucket read lock.
    get_mutex_(block->id()).rdlock();
    priority = block->check_replicate(now);
    need = (priority >= PLAN_PRIORITY_NORMAL);
    if (need)
    {
      block->get_servers(servers);
    }
    get_mutex_(block->id()).unlock();
    // Outside the lock: skip blocks that already have a queued task.
    need = need && !manager_.get_task_manager().exist_block(block->id());
  }
  return need;
}
// Compute the replication priority for this block at time `now`.
// Returns PLAN_PRIORITY_NONE when the block must not be replicated,
// PLAN_PRIORITY_NORMAL when it has fewer replicas than configured, and
// PLAN_PRIORITY_EMERGENCY when only a single replica remains.
PlanPriority BlockCollect::check_replicate(const time_t now) const
{
  // Skip blocks being created, belonging to an erasure-code family, already
  // queued for replication, not yet expired, or still holding a valid lease.
  if (is_creating() || is_in_family() || in_replicate_queue() || !expire(now) || has_valid_lease(now))
    return PLAN_PRIORITY_NONE;
  if (server_size_ <= 0)
  {
    // Every replica is gone: nothing left to copy from.
    TBSYS_LOG(WARN, "block: %"PRI64_PREFIX"u has been lost, do not replicate", info_.block_id_);
    return PLAN_PRIORITY_NONE;
  }
  PlanPriority level = PLAN_PRIORITY_NONE;
  if (1 == server_size_ && SYSPARAM_NAMESERVER.max_replication_ > 1)
    level = PLAN_PRIORITY_EMERGENCY;  // last copy standing: replicate urgently
  else if (server_size_ < SYSPARAM_NAMESERVER.max_replication_)
    level = PLAN_PRIORITY_NORMAL;
  return level;
}
2 节点扩容
随着数据规模总量的增加,集群扩容也必不可少了。当有新的数据节点加入时,集群自动检测所有数据节点上的block负载和迁移,来达到节点间的均衡。数据迁移的具体问题描述:哪个数据节点源上的哪些block数据,需要迁移到哪个数据节点目的地上?因此需要解决两个问题:
a. 敲定待迁移数据的源头节点source和目的节点dest?
首先,nameserver会获取所有数据节点的负载率,算出集群的数据平均负载率;数据节点负载利用率公式: avg_ratio = (use_capacity)/(total_capacity);
然后,随机选择低于平均负载率的数据节点作为目的节点;反之,选择高于平均负载率的数据节点作为源头节点。
b. 敲定源头数据节点上的哪些block_id需要迁移?
首先,计算源头数据节点上的所有block的活跃值;活跃值公式表示:
weights = th.last_statistics_time_ * ar.last_access_time_ratio + th.read_visit_count_ * ar.read_ratio + th.write_visit_count_ * ar.write_ratio + th.update_visit_count_ * ar.update_ratio + th.unlink_visit_count_ * ar.unlink_ratio;
然后,选择活跃值最低的block数据作为待迁移的block。
代码逻辑如下:
// Background migration loop: every MAX_SLEEP_TIME seconds, look for an
// imbalanced system-disk pair, pick a block to move, and issue the migrate
// command; after a successful dispatch, pause for CHECK_COMPLETE_WAIT_TIME
// before the next round. Exits when the runtime is marked destroyed.
void MigrateManager::run_()
{
  const int32_t MAX_SLEEP_TIME = 30;            // seconds between scan rounds
  const int32_t MAX_ARRAY_SIZE = 128;
  const int32_t CHECK_COMPLETE_WAIT_TIME = 120; // seconds to wait after dispatching a migration
  std::pair<uint64_t, int32_t> array[MAX_ARRAY_SIZE];
  common::ArrayHelper<std::pair<uint64_t, int32_t>> helper(MAX_ARRAY_SIZE, array);
  migrateserver::MsRuntimeGlobalInformation& mrgi = migrateserver::MsRuntimeGlobalInformation::instance();
  while (!mrgi.is_destroyed())
  {
    helper.clear();
    blocks_[0].clear();
    blocks_[1].clear();
    MigrateEntry entry;
    memset(&entry, 0, sizeof(entry));
    calc_system_disk_migrate_info_(entry);
    // Proceed when at least one endpoint candidate was found; the other
    // endpoint may still be filled in by choose_migrate_entry_.
    const bool has_candidate = (entry.source_addr_ != INVALID_SERVER_ID)
        || (entry.dest_addr_ != INVALID_SERVER_ID);
    if (has_candidate)
    {
      get_all_servers_(helper);
      // Refresh the index header of every known server before choosing.
      for (int64_t pos = 0; pos < helper.get_array_index(); ++pos)
      {
        std::pair<uint64_t, int32_t>* item = helper.at(pos);
        get_index_header_(item->first, item->second);
      }
      if (TFS_SUCCESS == choose_migrate_entry_(entry)
          && TFS_SUCCESS == do_migrate_(entry))
      {
        // Give the dispatched migration time to complete before rescanning.
        Func::sleep(CHECK_COMPLETE_WAIT_TIME, mrgi.is_destroy_);
      }
    }
    Func::sleep(MAX_SLEEP_TIME, mrgi.is_destroy_);
  }
}
//a. 敲定源和目的数据节点
// Scan all known system-disk dataservers and record migration endpoint
// candidates in `entry`: a server whose load ratio is below the cluster
// average (minus balance_percent_) becomes the destination; one above the
// average (plus balance_percent_) — or completely full — becomes the source.
// Note: when several servers qualify, the last one scanned wins.
void MigrateManager::calc_system_disk_migrate_info_(MigrateEntry& entry) const
{
  memset(&entry, 0, sizeof(entry));
  int64_t total_capacity = 0;
  int64_t use_capacity = 0;
  statistic_all_server_info_(total_capacity, use_capacity);
  if (total_capacity <= 0 || use_capacity <= 0)
    return;  // no usable capacity statistics yet
  const double avg_ratio = static_cast<double>(use_capacity) / static_cast<double>(total_capacity);
  tbutil::Mutex::Lock lock(mutex_);
  for (CONST_SERVER_MAP_ITER iter = servers_.begin(); iter != servers_.end(); ++iter)
  {
    const common::DataServerStatInfo& info = iter->second;
    const bool usable = (INVALID_SERVER_ID != info.id_)
        && (common::DATASERVER_DISK_TYPE_SYSTEM == info.type_)
        && (info.total_capacity_ > 0);
    if (!usable)
      continue;
    const double curr_ratio = static_cast<double>(info.use_capacity_) / static_cast<double>(info.total_capacity_);
    if (curr_ratio < avg_ratio - balance_percent_)
    {
      entry.dest_addr_ = info.id_;    // lightly loaded: candidate destination
    }
    else if ((curr_ratio > (avg_ratio + balance_percent_)) || curr_ratio >= 1.0)
    {
      entry.source_addr_ = info.id_;  // overloaded or full: candidate source
    }
  }
}
//b. 敲定源数据节点上的block_id
// Compute the activity weight of a block from its index header statistics.
// Lower weight means a colder block (a better migration candidate).
// Returns -1 when the block is not eligible for weighting: full-disk blocks
// are only weighted once they are cold (last stats older than
// hot_time_range_) and already full; system-disk blocks are always weighted.
int64_t MigrateManager::calc_block_weight_(const common::IndexHeaderV2& info, const int32_t type) const
{
  const int64_t now = time(NULL);
  const AccessRatio& ratio = DATASERVER_DISK_TYPE_SYSTEM == type ? system_disk_access_ratio_ : full_disk_access_ratio_;
  const ThroughputV2& tp = info.throughput_;
  const bool eligible = common::DATASERVER_DISK_TYPE_SYSTEM == type ? true :
      (tp.last_statistics_time_ + hot_time_range_ < now && is_full(info.info_));
  if (!eligible)
    return -1;
  // Weighted sum of access counters, per-disk-type ratio coefficients.
  return tp.last_statistics_time_ * ratio.last_access_time_ratio
      + tp.read_visit_count_ * ratio.read_ratio
      + tp.write_visit_count_ * ratio.write_ratio
      + tp.update_visit_count_ * ratio.update_ratio
      + tp.unlink_visit_count_ * ratio.unlink_ratio;
}
//c. 发送“迁移任务”给数据节点,开始具体的迁移
// Send an "immediate replicate (move)" command for `current` to the
// nameserver vip, retrying up to 3 times on failure.
// Returns TFS_SUCCESS when the nameserver acknowledges with STATUS_MESSAGE_OK,
// otherwise an error code (parameter/client/send errors).
int MigrateManager::do_migrate_(MigrateEntry& current)
{
  char msg[256] = {'\0'};
  // All three endpoints of the move must be valid before dispatching.
  int32_t ret = (current.block_id_ != INVALID_BLOCK_ID
      && current.source_addr_ != INVALID_SERVER_ID
      && current.dest_addr_ != INVALID_SERVER_ID) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
  if (TFS_SUCCESS == ret)
  {
    ClientCmdMessage req_msg;
    req_msg.set_value1(current.source_addr_);
    req_msg.set_value2(current.dest_addr_);
    req_msg.set_value3(current.block_id_);
    req_msg.set_value4(REPLICATE_BLOCK_MOVE_FLAG_YES);
    req_msg.set_value5(MOVE_BLOCK_NO_CHECK_RACK_FLAG_YES);
    req_msg.set_cmd(CLIENT_CMD_IMMEDIATELY_REPL);
    int32_t retry_times = 3;
    const int32_t TIMEOUT_MS = 2000;
    do
    {
      NewClient* client = NewClientManager::get_instance().create_client();
      ret = (NULL != client) ? TFS_SUCCESS : EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR;
      if (TFS_SUCCESS == ret)
      {
        tbnet::Packet* result = NULL;
        ret = send_msg_to_server(ns_vip_port_, client, &req_msg, result, TIMEOUT_MS);
        if (TFS_SUCCESS == ret)
        {
          ret = STATUS_MESSAGE == result->getPCode() ? TFS_SUCCESS : EXIT_SEND_MIGRATE_MSG_ERROR;
        }
        if (TFS_SUCCESS == ret)
        {
          StatusMessage* rsp = dynamic_cast<StatusMessage*>(result);
          // Clamp to sizeof(msg)-1 (NOT sizeof(msg)): strncpy does not
          // NUL-terminate when the source is at least `len` bytes long, so
          // copying 256 bytes into msg[256] would leave it unterminated.
          int32_t len = std::min(static_cast<int32_t>(rsp->get_error_msg_length()),
                                 static_cast<int32_t>(sizeof(msg)) - 1);
          len = std::max(0, len);
          strncpy(msg, rsp->get_error(), len);
          msg[len] = '\0';
          ret = STATUS_MESSAGE_OK == rsp->get_status() ? TFS_SUCCESS : EXIT_SEND_MIGRATE_MSG_ERROR;
        }
      }
      NewClientManager::get_instance().destroy_client(client);
    }
    while (retry_times-- > 0 && TFS_SUCCESS != ret);
  }
  return ret;
}