1. tfs节点的管理类介绍
1.1 数据节点
数据节点主要进行实际数据的存储与读写,其管理类是DataService,其职责如下:
相关数据流的任务函数处理由handlePacketQueue函数完成,如下:
// A data node (dataserver). Aggregates the sub-managers that
// DataService::handlePacketQueue (below) dispatches messages to.
class DataService
{
OpManager op_manager_;
LeaseManager *lease_manager_;
DataHelper data_helper_;
TaskManager task_manager_;                  // handles NS task messages (replicate/compact/EC ...)
BlockManager *block_manager_;
TrafficControl traffic_control_;
ClientRequestServer client_request_server_; // handles client file I/O messages
WritableBlockManager writable_block_manager_;
CheckManager check_manager_;                // handles block check messages
SyncManager* sync_manager_;
MigrateManager* migrate_manager_;
TimeoutThreadHelperPtr timeout_thread_;
RunTaskThreadHelperPtr task_thread_;
RunCheckThreadHelperPtr check_thread_;
}; // fix: class definition must end with ';'
// Dataserver packet dispatch, run from the service packet queue.
// Routes each pcode to the owning sub-manager; if a handler fails,
// replies a generic error packet to the sender. Returns the base
// service's verdict (bret), not the handler result.
bool DataService::handlePacketQueue(tbnet::Packet* packet, void* args)
{
bool bret = BaseService::handlePacketQueue(packet, args);
if (bret)
{
int32_t pcode = packet->getPCode();
// LOCAL_PACKET is internal and must not reach this dispatcher.
int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : TFS_SUCCESS;
if (TFS_SUCCESS == ret)
{
switch (pcode)
{
case LIST_BLOCK_MESSAGE:
ret = list_blocks(dynamic_cast<ListBlockMessage*>(packet));
break;
// Task-type messages (replicate / compact / EC marshalling,
// reinstate, dissolve / version-conflict) go to the task manager.
case REPLICATE_BLOCK_MESSAGE:
case COMPACT_BLOCK_MESSAGE:
case DS_COMPACT_BLOCK_MESSAGE:
case DS_REPLICATE_BLOCK_MESSAGE:
case RESP_DS_COMPACT_BLOCK_MESSAGE:
case RESP_DS_REPLICATE_BLOCK_MESSAGE:
case REQ_EC_MARSHALLING_MESSAGE:
case REQ_EC_REINSTATE_MESSAGE:
case REQ_EC_DISSOLVE_MESSAGE:
case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
ret = task_manager_.handle(dynamic_cast<BaseTaskMessage*>(packet));
break;
case GET_BLOCK_INFO_MESSAGE_V2:
ret = get_block_info(dynamic_cast<GetBlockInfoMessageV2*>(packet));
break;
case GET_SERVER_STATUS_MESSAGE:
ret = get_server_status(dynamic_cast<GetServerStatusMessage*>(packet));
break;
case STATUS_MESSAGE:
ret = get_ping_status(dynamic_cast<StatusMessage*>(packet));
break;
case CLIENT_CMD_MESSAGE:
ret = client_command(dynamic_cast<ClientCmdMessage*>(packet));
break;
// Client-facing file/block I/O messages go to the request server.
case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
case STAT_FILE_MESSAGE_V2:
case READ_FILE_MESSAGE_V2:
case WRITE_FILE_MESSAGE_V2:
case CLOSE_FILE_MESSAGE_V2:
case UNLINK_FILE_MESSAGE_V2:
case NEW_BLOCK_MESSAGE_V2:
case REMOVE_BLOCK_MESSAGE_V2:
case READ_RAWDATA_MESSAGE_V2:
case WRITE_RAWDATA_MESSAGE_V2:
case READ_INDEX_MESSAGE_V2:
case WRITE_INDEX_MESSAGE_V2:
case QUERY_EC_META_MESSAGE:
case COMMIT_EC_META_MESSAGE:
case GET_ALL_BLOCKS_HEADER_MESSAGE:
ret = client_request_server_.handle(packet);
break;
case REQ_CHECK_BLOCK_MESSAGE:
case REPORT_CHECK_BLOCK_MESSAGE:
ret = check_manager_.handle(packet);
break;
default:
TBSYS_LOG(ERROR, "process packet pcode: %d\n", pcode);
ret = TFS_ERROR;
break;
}
// On any handler failure, send one error reply to the peer.
if (common::TFS_SUCCESS != ret)
{
common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed");
}
}
}
return bret;
}
其中的主要职责由task_manager_和client_request_server_完成:
a. task_manager_ 负责执行数据节点的均衡、数据迁移、清理压缩、整理等操作,满足系统集群的平衡迁移、复制容错、节点扩容等需求;
b. client_request_server_ 负责执行数据节点上文件的读、写、查询、删除等操作,直接响应客户端的请求。相应的代码如下:
// Dispatch a nameserver-issued task message to the matching add_*_task
// helper. On success a generic STATUS_MESSAGE_OK is replied here;
// individual helpers only queue the task.
int TaskManager::handle(BaseTaskMessage* packet)
{
int pcode = packet->getPCode();
int ret = TFS_SUCCESS;
switch(pcode)
{
case REPLICATE_BLOCK_MESSAGE:
ret = add_replicate_task(dynamic_cast<ReplicateBlockMessage*>(packet));
break;
case COMPACT_BLOCK_MESSAGE:
ret = add_compact_task(dynamic_cast<NsRequestCompactBlockMessage*>(packet));
break;
case DS_REPLICATE_BLOCK_MESSAGE:
ret = add_ds_replicate_task(dynamic_cast<DsReplicateBlockMessage*>(packet));
break;
case DS_COMPACT_BLOCK_MESSAGE:
ret = add_ds_compact_task(dynamic_cast<DsCompactBlockMessage*>(packet));
break;
case REQ_EC_MARSHALLING_MESSAGE:
ret = add_marshalling_task(dynamic_cast<ECMarshallingMessage*>(packet));
break;
case REQ_EC_REINSTATE_MESSAGE:
ret = add_reinstate_task(dynamic_cast<ECReinstateMessage*>(packet));
break;
case REQ_EC_DISSOLVE_MESSAGE:
ret = add_dissolve_task(dynamic_cast<ECDissolveMessage*>(packet));
break;
case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
ret = add_resolve_conflict_task(dynamic_cast<NsReqResolveBlockVersionConflictMessage*>(packet));
// NOTE(review): returns here instead of break, so the generic
// STATUS_MESSAGE_OK reply below is skipped for this message type —
// presumably the conflict-resolution path replies on its own; verify.
return ret;
case RESP_DS_REPLICATE_BLOCK_MESSAGE:
case RESP_DS_COMPACT_BLOCK_MESSAGE:
// Completion reports from peer dataservers.
ret = handle_complete(packet);
break;
default:
ret = TFS_ERROR;
TBSYS_LOG(WARN, "unknown pcode : %d", pcode);
break;
}
if (TFS_SUCCESS == ret)
{
packet->reply(new StatusMessage(STATUS_MESSAGE_OK));
}
return ret;
}
// Dispatch a client-facing message (file I/O, block ops, EC meta) to
// the matching handler. Returns EXIT_UNKNOWN_MSGTYPE for unexpected
// pcodes; the caller replies the error packet on failure.
int ClientRequestServer::handle(tbnet::Packet* packet)
{
int ret = (NULL == packet) ? EXIT_POINTER_NULL : TFS_SUCCESS;
if (TFS_SUCCESS == ret)
{
int32_t pcode = packet->getPCode();
switch (pcode)
{
case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
ret = report_block(dynamic_cast<CallDsReportBlockRequestMessage*>(packet));
break;
case STAT_FILE_MESSAGE_V2:
ret = stat_file(dynamic_cast<StatFileMessageV2*>(packet));
break;
case READ_FILE_MESSAGE_V2:
ret = read_file(dynamic_cast<ReadFileMessageV2*>(packet));
break;
case WRITE_FILE_MESSAGE_V2:
ret = write_file(dynamic_cast<WriteFileMessageV2*>(packet));
break;
case CLOSE_FILE_MESSAGE_V2:
ret = close_file(dynamic_cast<CloseFileMessageV2*>(packet));
break;
case UNLINK_FILE_MESSAGE_V2:
ret = unlink_file(dynamic_cast<UnlinkFileMessageV2*>(packet));
break;
case NEW_BLOCK_MESSAGE_V2:
ret = new_block(dynamic_cast<NewBlockMessageV2*>(packet));
break;
case REMOVE_BLOCK_MESSAGE_V2:
ret = remove_block(dynamic_cast<RemoveBlockMessageV2*>(packet));
break;
case READ_RAWDATA_MESSAGE_V2:
ret = read_raw_data(dynamic_cast<ReadRawdataMessageV2*>(packet));
break;
case WRITE_RAWDATA_MESSAGE_V2:
ret = write_raw_data(dynamic_cast<WriteRawdataMessageV2*>(packet));
break;
case READ_INDEX_MESSAGE_V2:
ret = read_index(dynamic_cast<ReadIndexMessageV2*>(packet));
break;
case WRITE_INDEX_MESSAGE_V2:
ret = write_index(dynamic_cast<WriteIndexMessageV2*>(packet));
break;
case QUERY_EC_META_MESSAGE:
ret = query_ec_meta(dynamic_cast<QueryEcMetaMessage*>(packet));
break;
case COMMIT_EC_META_MESSAGE:
ret = commit_ec_meta(dynamic_cast<CommitEcMetaMessage*>(packet));
break;
case GET_ALL_BLOCKS_HEADER_MESSAGE:
ret = get_all_blocks_header(dynamic_cast<GetAllBlocksHeaderMessage*>(packet));
break;
default:
TBSYS_LOG(WARN, "process packet pcode: %d\n", pcode);
ret = EXIT_UNKNOWN_MSGTYPE;
break;
}
}
return ret;
}
1.2 主控节点
主控节点主要职责分为三块:
a. 各个控制流的任务管理与下发,如Block的创建,删除,复制,均衡,整理;
b. 各个数据节点健康状态的检查;
c. 元数据及元数据与各个数据节点上block的检索管理。
其管理类是NameServer,负责的相关任务同样由handlePacketQueue函数处理,如下:
// The nameserver: metadata management, dataserver health checks and
// task dispatch (see NameServer::handlePacketQueue below).
class NameServer
{
LayoutManager layout_manager_;
NameServerHeartManager master_slave_heart_manager_; // master/slave NS heartbeat
HeartManagement heart_manager_;                     // dataserver heartbeat
}; // fix: class definition must end with ';'
// Owns the nameserver-side metadata managers: block/server/task/family
// state, oplog sync and deferred garbage collection.
class LayoutManager
{
BlockManager block_manager_;
ServerManager server_manager_;
TaskManager task_manager_;
OpLogSyncManager oplog_sync_mgr_;
FamilyManager family_manager_;
ClientRequestServer client_request_server_;
common::GCObjectManager<LayoutManager, common::BaseObject> gc_manager_;
}; // fix: class definition must end with ';'
// Nameserver packet dispatch. Routes control-flow messages (open/batch
// open, task commits, heartbeats, client commands, block apply/giveup)
// to their handlers; replies an error packet on handler failure.
bool NameServer::handlePacketQueue(tbnet::Packet *packet, void *args)
{
bool bret = BaseService::handlePacketQueue(packet, args);
if (bret)
{
int32_t pcode = packet->getPCode();
// LOCAL_PACKET is internal and must not be dispatched here.
int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : common::TFS_SUCCESS;
if (TFS_SUCCESS == ret)
{
//TBSYS_LOG(DEBUG, "PCODE: %d", pcode);
common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
switch (pcode)
{
case GET_BLOCK_INFO_MESSAGE_V2:
ret = open(msg);
break;
case BATCH_GET_BLOCK_INFO_MESSAGE_V2:
ret = batch_open(msg);
break;
// Task completion/commit messages are handled by the layout
// manager's client request server.
case REPLICATE_BLOCK_MESSAGE:
case BLOCK_COMPACT_COMPLETE_MESSAGE:
case REQ_EC_MARSHALLING_COMMIT_MESSAGE:
case REQ_EC_REINSTATE_COMMIT_MESSAGE:
case REQ_EC_DISSOLVE_COMMIT_MESSAGE:
ret = layout_manager_.get_client_request_server().handle(msg);
break;
case SHOW_SERVER_INFORMATION_MESSAGE:
ret = show_server_information(msg);
break;
case STATUS_MESSAGE:
ret = ping(msg);
break;
case DUMP_PLAN_MESSAGE:
ret = dump_plan(msg);
break;
case CLIENT_NS_KEEPALIVE_MESSAGE:
ret = client_keepalive(msg);
break;
case CLIENT_CMD_MESSAGE:
ret = client_control_cmd(msg);
break;
case REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
{
// NOTE: this local deliberately shadows the outer 'packet'
// parameter; seqno 0 means a fresh conflict report, otherwise
// it is the completion of a previously issued task.
BaseTaskMessage* packet = dynamic_cast<BaseTaskMessage*>(msg);
if (0 == packet->get_seqno())
ret = resolve_block_version_conflict(msg);
else
ret = layout_manager_.get_client_request_server().handle(msg);
}
break;
case REQ_GET_FAMILY_INFO_MESSAGE:
ret = get_family_info(msg);
break;
case REPAIR_BLOCK_MESSAGE_V2:
ret = repair(msg);
break;
case DS_APPLY_BLOCK_MESSAGE:
ret = apply_block(msg);
break;
case DS_APPLY_BLOCK_FOR_UPDATE_MESSAGE:
ret = apply_block_for_update(msg);
break;
case DS_GIVEUP_BLOCK_MESSAGE:
ret = giveup_block(msg);
break;
default:
ret = EXIT_UNKNOWN_MSGTYPE;
TBSYS_LOG(WARN, "unknown msg type: %d", pcode);
break;
}
if (common::TFS_SUCCESS != ret)
{
msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed, pcode: %d", pcode);
}
}
}
return bret;
}
2. 文件物理结构与逻辑结构设计介绍
2.1 文件存储设计
文件存储设计官方相关公布图如下:
在tfs系统里面,个人理解数据存储设计概念分为三层:
a. 应用层:用户最关心的业务数据文件,如图片、文档等,以file_id标识。
b. 逻辑层:开发人员最关心的,用LogicBlock表示。
c. 物理层:运维人员最关心的,用BasePhysicalBlock表示。
Block是逻辑上的概念,一个block包含多个bucket,每个bucket包含多个slot,每个slot与一个file_id一一对应,file_id在block内唯一。BasePhysicalBlock是物理磁盘上的概念,一个block也包含一个或多个BasePhysicalBlock,与磁盘的碎片程度有关。
2.2 关于Block
每个block_id标示着一个Block,tfs逻辑上是以Block的方式管理文件内容的,block_id_在集群中是全局唯一的,由nameserver全局唯一生成与管理,file_id_在每个block中是局部唯一的,所以通过二元组<block_id_,file_id_>可以定位集群中用户所需要的数据。
另外,每个block的大小基本是配置固定的,例如默认64M。由于tfs与HDFS的需求不同,主要用于管理存储小文件,所以数据文件偏小,但当满足大数据文件的存储需求时,也会将其拆分成多个小文件FileSegment。所以读取文件数据时的数据检索流程为:FileSegment-->LogicBlock-->FileInfoV2-->BasePhysicalBlock
2.3 结构表达
各个相应的类结构表达如下:
// Upload request information from the user: one segment of a (possibly
// split) file, addressed by <block_id_, file_id_> plus an offset/length
// range within that file.
struct FileSegment
{
uint64_t block_id_; // logical block holding the segment
uint64_t file_id_;  // file id, unique within the block
int32_t offset_;    // byte offset of this segment
int32_t length_;    // byte length of this segment
};
// Maps a logical block to a physical block file on disk. Physical ids
// are 20-bit fields (max 1048575); next_/prev_ link the extension
// blocks of one logical block into a list.
struct BlockIndex
{
uint64_t logic_block_id_;
int32_t physical_block_id_:20;//<=1048575
int32_t physical_file_name_id_:20;//<=1048575 number 1~1048575
int32_t next_index_:20;//<=1048575
int32_t prev_index_:20;//<=1048575
int8_t index_:7;// 0 ~36(0: main block, 1~36: ext block)
int8_t status_:2;//0: uncomplete, 1: complete
int8_t split_flag_:2;//0: unsplit, 1:split
int8_t split_status_:2;//0: split uncomplete, 1: split complete
int8_t reserve_:3;//reserve
};
// Per-file metadata record kept in a block's index. The original
// comment "30" presumably notes the serialized size in bytes — verify.
struct FileInfoV2 //30
{
uint64_t id_; //file id
int32_t offset_; //offset in block file
int32_t size_:28;// file size
int8_t status_:4;//delete flag
uint32_t crc_; // checksum
int32_t modify_time_;//modify time
int32_t create_time_; // create time
uint16_t next_; //next index (hash-bucket chaining)
}; // fix: struct definition must end with ';'
// Header of a block index file: block info, throughput stats, and
// allocation offsets. The union overlays the bucket size (data block)
// with the index count (presumably for parity/EC blocks — verify).
struct IndexHeaderV2
{
common::BlockInfoV2 info_;//56
ThroughputV2 throughput_;//72
int32_t used_offset_;//12 * 4 = 48
int32_t avail_offset_;
int32_t marshalling_offset_;
uint32_t seq_no_;
union
{
uint16_t file_info_bucket_size_;
uint16_t index_num_;
};
uint16_t used_file_info_bucket_size_;
int8_t max_index_num_;
int8_t reserve_[27];
}; // fix: struct definition must end with ';'
// A physical block file on disk: a [start_, end_) data range accessed
// through a FileOperation. Lifetime is managed via GCObject.
class BasePhysicalBlock : public GCObject
{
public:
int32_t physical_block_id_;
int32_t start_;//the data start offset of this block file
int32_t end_;//the data end offset of this block file
// fix: the original "FileOperation file_op_(path);" is not valid C++
// (a member cannot use a call-style initializer); the member is
// constructed from the block file path in the constructor instead.
FileOperation file_op_;
};
// A logical block: index handle + data handle plus the list of physical
// blocks that back it (one or more, depending on disk fragmentation).
class LogicBlock
{
BaseIndexHandle* index_handle_;
DataHandle data_handle_;
std::vector<PhysicalBlock*> physical_block_list_; //the physical block list of this logic block
}; // fix: class definition must end with ';'
// Sorted container of all logical blocks on this dataserver.
class LogicBlockManager
{
common::TfsSortedVector<BaseLogicBlock*, LogicBlockIdCompare> logic_blocks_;
}; // fix: class definition must end with ';'
// Sorted containers of all physical blocks and of the currently
// allocated ones.
class PhysicalBlockManager
{
common::TfsSortedVector<BasePhysicalBlock*,PhysicalBlockIdCompare> physical_blocks_;
common::TfsSortedVector<BasePhysicalBlock*,PhysicalBlockIdCompare> alloc_physical_blocks_;
}; // fix: class definition must end with ';'
// Facade over the logical and physical block managers, guarded by a
// reader/writer lock; freed objects go through the GC manager.
class BlockManager
{
LogicBlockManager logic_block_manager_;
PhysicalBlockManager physical_block_manager_;
GCObjectManager gc_manager_;
mutable common::RWLock mutex_;
}; // fix: class definition must end with ';'
3. 读取流程
3.1 客户端
// Read up to `count` bytes from the open file `fd` into `buf`.
// Returns the byte count from TfsFile::read, or a negative error code
// for a bad argument / unknown fd. Read statistics are recorded on the
// file's session either way.
int64_t TfsClientImplV2::read(const int fd, void* buf, const int64_t count)
{
  // Reject obviously invalid arguments up front.
  if ((fd < 0) || (NULL == buf) || (count < 0))
  {
    return EXIT_PARAMETER_ERROR;
  }
  TfsFile* tfs_file = get_file(fd);
  if (NULL == tfs_file)
  {
    return EXIT_INVALIDFD_ERROR;
  }
  const int64_t nread = tfs_file->read(buf, count);
  // A positive result counts as a successful read in session stats.
  tfs_file->get_session()->update_stat(ST_READ, nread > 0);
  return nread;
}
// Pick the dataserver to read from: rotate over the replica list via
// read_index_. Returns 0 when the replica list is empty.
uint64_t File::get_read_ds() const
{
  uint64_t server_id = 0;
  if (ds_.size() > 0)
  {
    server_id = ds_[read_index_%ds_.size()];
  }
  // fix: the original fell off the end of a non-void function
  // (undefined behavior); return the selected server id.
  return server_id;
}
3.2 服务端
// Read file data for a client request. Stats the file first (degraded
// EC read when the block belongs to a family), truncates the requested
// length at EOF, reads into a freshly allocated response message and
// replies it; length 0 signals end-of-file to the client.
int ClientRequestServer::read_file(ReadFileMessageV2* message)
{
TIMER_START();
uint64_t block_id = message->get_block_id();
uint64_t attach_block_id = message->get_attach_block_id();
uint64_t file_id = message->get_file_id();
int32_t length = message->get_length();
int32_t offset = message->get_offset();
int8_t flag = message->get_flag();
uint64_t peer_id = message->get_connection()->getPeerId();
const FamilyInfoExt& family_info = message->get_family_info();
int ret = ((INVALID_BLOCK_ID == block_id) || (INVALID_FILE_ID == file_id) ||
(offset < 0) || (length <= 0)) ? EXIT_PARAMETER_ERROR : TFS_SUCCESS;
if (TFS_SUCCESS == ret)
{
FileInfoV2 file_info;
file_info.id_ = file_id;
if (INVALID_FAMILY_ID == family_info.family_id_)
{
// Normal path: stat directly from the local block.
ret = get_block_manager().stat(file_info,
FORCE_STAT, block_id, message->get_attach_block_id());
}
else
{
// Degraded path: reconstruct the stat via the EC family.
ret = get_data_helper().stat_file_degrade(block_id,
file_id, FORCE_STAT, family_info, file_info);
}
if (TFS_SUCCESS == ret)
{
// truncate read length
if (offset + length > file_info.size_)
{
length = file_info.size_ - offset;
}
// offset beyond EOF makes the truncated length negative.
ret = (length < 0) ? EXIT_PARAMETER_ERROR: TFS_SUCCESS;
if (TFS_SUCCESS == ret)
{
ReadFileRespMessageV2* resp_msg = new (std::nothrow) ReadFileRespMessageV2();
// fix: the original asserted the wrong pointer (message, which was
// already dereferenced above) instead of the just-allocated resp_msg.
assert(NULL != resp_msg);
// if length is truncated to 0
// reply a packet with length 0 to tell client that it already reach to the end of file
if (0 == length)
{
resp_msg->set_length(0);
}
else
{
char* buffer = resp_msg->alloc_data(length);
assert(NULL != buffer);
if (INVALID_FAMILY_ID == family_info.family_id_)
{
TimeStat timer;
timer.start();
ret = get_block_manager().read(buffer,
length, offset, file_id, flag, block_id, attach_block_id);
timer.end();
ret = (ret < 0) ? ret: TFS_SUCCESS;
// log slow read request
if (TFS_SUCCESS == ret && timer.duration() > SYSPARAM_DATASERVER.max_io_warn_time_)
{
// fix: arguments were swapped relative to the format labels
// (blockid got attach_block_id and vice versa).
TBSYS_LOG(WARN, "slow read request. blockid: %"PRI64_PREFIX"u, "
"attach_blockid: %"PRI64_PREFIX"u, fileid: %"PRI64_PREFIX"u, cost: %"PRI64_PREFIX"d",
block_id, attach_block_id, file_id, timer.duration());
}
}
else
{
ret = get_data_helper().read_file_degrade(block_id,
file_info, buffer, length, offset, flag, family_info);
}
}
if (TFS_SUCCESS != ret)
{
// upper layer will reply error packet to client
tbsys::gDelete(resp_msg);
}
else
{
// readv2 support
if (flag & READ_DATA_OPTION_WITH_FINFO)
{
resp_msg->set_file_info(file_info);
}
ret = message->reply(resp_msg);
if (TFS_SUCCESS == ret)
{
get_traffic_control().rw_traffic_stat(false, length);
}
}
}
}
}
TIMER_END();
return ret;
}
// b BlockManager读取buf,从logic_block_id
int BlockManager::read(char* buf, int32_t& nbytes, const int32_t offset,
const uint64_t fileid, const int8_t flag, const uint64_t logic_block_id, const uint64_t attach_logic_block_id)
{
int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0 && INVALID_FILE_ID != fileid && flag >= 0
&& INVALID_BLOCK_ID != logic_block_id && INVALID_BLOCK_ID != attach_logic_block_id) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
if (TFS_SUCCESS == ret)
{
BaseLogicBlock* logic_block = get(logic_block_id);
ret = (NULL != logic_block) ? TFS_SUCCESS : EXIT_NO_LOGICBLOCK_ERROR;
if (TFS_SUCCESS == ret)
{
ret = logic_block->read(buf, nbytes, offset, fileid, flag, attach_logic_block_id);
}
}
return ret;
}
// c. Read real data from the physical block(s); one logic block may
// span several physical blocks, so the requested [offset, offset+nbytes)
// range is served piecewise: each iteration maps the current logical
// offset to a physical block + inner offset, reads the overlapping
// chunk, and advances. Returns nbytes on success, else the error code.
int DataHandle::pread(char* buf, const int32_t nbytes, const int32_t offset)
{
int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
if (TFS_SUCCESS == ret)
{
PhysicalBlock* physical_block = NULL;
int32_t inner_offset = 0, length = nbytes, current_offset = offset;
int32_t inner_length = 0, mem_offset = 0, total_read_length = 0;
while ((TFS_SUCCESS == ret) && (current_offset < (offset + nbytes)))
{
inner_length = length;
// Map current_offset to a physical block; inner_length is clamped
// to what remains in that block, inner_offset is the offset inside it.
ret = logic_block_.choose_physic_block(physical_block, inner_length, inner_offset, current_offset);
if (TFS_SUCCESS == ret)
{
// Read no more than both the remaining request and the remaining
// bytes of this physical block.
length = std::min(length, inner_length);
ret = physical_block->pread((buf + mem_offset), length, inner_offset);
ret = ret >= 0 ? TFS_SUCCESS : ret;
if (TFS_SUCCESS == ret)
{
current_offset += length;
mem_offset += length;
total_read_length += length;
length = nbytes - total_read_length;
}
}
}
}
return TFS_SUCCESS == ret ? nbytes : ret;
}
4. 写流程
4.1 客户端
// Client-side write: build a WriteFileMessageV2 describing the block,
// file, lease and replica list, then send it synchronously to the
// master dataserver (first replica). NOTE(review): this excerpt omits
// the response handling and the NewClient cleanup that presumably
// follow in the full implementation — verify against the original file.
int TfsFile::write_ex(const char* buf, int64_t count)
{
int ret = TFS_SUCCESS;
uint64_t server = 0;
tbnet::Packet* resp_msg = NULL;
NewClient* client = NewClientManager::get_instance().create_client();
if (NULL == client)
{
ret = EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR;
TBSYS_LOG(WARN, "create new client fail.");
}
else
{
WriteFileMessageV2 msg;
msg.set_block_id(fsname_.get_block_id());
msg.set_attach_block_id(fsname_.get_block_id());
msg.set_file_id(fsname_.get_file_id());
msg.set_offset(file_.offset_);
msg.set_length(count);
msg.set_lease_id(file_.lease_id_);
msg.set_master_id(file_.get_write_ds());
msg.set_version(file_.version_);
msg.set_flag(file_.opt_flag_);
msg.set_ds(file_.ds_);
msg.set_data(buf);
if (file_.has_family())
{
msg.set_family_info(file_.family_info_);
}
// Writes always go to the master replica.
server = file_.get_write_ds();
ret = send_msg_to_server(server, client, &msg, resp_msg, ClientConfig::wait_timeout_);
}
// fix: the original fell off the end of a non-void function (UB).
return ret;
}
// The master replica — the first entry of the replica list — receives
// all writes. Returns 0 when no replica is known.
uint64_t File::get_write_ds() const
{
  if (ds_.empty())
  {
    return 0;
  }
  return ds_.front();
}
4.2 服务端
a. NameServer逻辑过程
// Create a brand-new block on the nameserver's initiative: allocate a
// block id, insert the BlockCollect, choose target dataservers, tell
// them to create the block, build the block<->server relations and log
// the operation; roll back the insert on any failure.
// NOTE(review): this excerpt elides the parameter list — block_id,
// server and now are presumably parameters in the original; verify.
BlockCollect* LayoutManager::add_new_block_helper_create_by_system_()
{
BlockCollect* block = NULL;
int32_t ret = (0 == block_id) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
if (TFS_SUCCESS == ret)
{
uint64_t result[MAX_REPLICATION_NUM];
ArrayHelper<uint64_t> helper(MAX_REPLICATION_NUM, result);
uint64_t news[MAX_REPLICATION_NUM];
ArrayHelper<uint64_t> news_helper(MAX_REPLICATION_NUM, news);
// If a requesting server was given, it is a mandatory replica target.
if (NULL != server)
helper.push_back(server->id());
block_id = get_alive_block_id_(false);
ret = (INVALID_BLOCK_ID == block_id) ? EXIT_BLOCK_ID_INVALID_ERROR : TFS_SUCCESS;
if (TFS_SUCCESS == ret)
{
//add block collect object
block = get_block_manager().insert(block_id, now, true);
ret = (NULL != block) ? TFS_SUCCESS : EXIT_NO_BLOCK;
}
if (TFS_SUCCESS == ret)
{
// Pick enough additional servers to reach the replication factor.
int32_t count = SYSPARAM_NAMESERVER.max_replication_ - helper.get_array_index();
if (count > 0)
{
get_server_manager().choose_create_block_target_server(helper, news_helper, count);
}
// pobject receives the removed BlockCollect on rollback; it is then
// handed to the GC manager for deferred deletion.
BlockCollect* pobject = NULL;
ret = !helper.empty() ? TFS_SUCCESS : EXIT_CHOOSE_CREATE_BLOCK_TARGET_SERVER_ERROR;
if (TFS_SUCCESS == ret)//add block collect object successful
{
ret = add_new_block_helper_send_msg_(block_id, helper);
if (TFS_SUCCESS == ret)
{
//build relation
ret = add_new_block_helper_build_relation_(block, helper, now);
if (TFS_SUCCESS == ret)
{
add_new_block_helper_write_log_(block_id, helper, now);
}
}//end send message to dataserver successful
else
{
get_block_manager().remove(pobject, block_id);//rollback
}
}
else
{
get_block_manager().remove(pobject, block_id);//rollback
}
get_gc_manager().insert(pobject, now);
}
}//end if (TFS_SUCCESS == ret) check parameter
return TFS_SUCCESS == ret ? block : NULL;
}
// a. Generate a block id. Ids are allocated from a global counter under
// a mutex; the counter is persisted (flush_) only every
// SKIP_BLOCK_NUMBER allocations to limit I/O. When verify is set the
// top bit marks the id as a verify (parity) block.
uint64_t BlockIdFactory::generation(const bool verify)
{
mutex_.lock();
++count_;
uint64_t id = ++global_id_;
assert(id <= MAX_BLOCK_ID);
bool flush_flag = false;
if (count_ >= SKIP_BLOCK_NUMBER)
{
flush_flag = true;
count_ = 0;
}
mutex_.unlock();
int32_t ret = common::TFS_SUCCESS;
// Persist outside the lock to keep the critical section short.
if (flush_flag)
{
ret = flush_(id);
if (common::TFS_SUCCESS != ret)
{
TBSYS_LOG(WARN, "update global block id failed, id: %"PRI64_PREFIX"u, ret: %d", id, ret);
}
}
if (common::TFS_SUCCESS == ret)
{
// Tag verify-block ids with the highest bit.
if (verify)
id |= 0x8000000000000000;
}
return id;
}
// b. Choose data nodes. Pick up to `count` target servers for a new
// block, avoiding LANs (rack groups) already represented in `result`.
// Newly chosen servers are appended to both `news` and `result`.
// Returns how many of the requested `count` could NOT be chosen.
int ServerManager::choose_create_block_target_server(common::ArrayHelper<uint64_t>& result,
common::ArrayHelper<uint64_t>& news, const int32_t count) const
{
news.clear();
// Collect the LANs of the servers already selected, to spread replicas.
std::set<uint32_t> lans;
get_lans_(lans, result);
ServerCollect* pserver = NULL;
int32_t index = count;
while (index-- > 0)
{
pserver = NULL;
// NOTE: "replciate" is a typo carried in the real function name.
if (TFS_SUCCESS == choose_replciate_random_choose_server_base_lock_(pserver, result, lans))
{
assert(NULL != pserver);
news.push_back(pserver->id());
result.push_back(pserver->id());
uint32_t lan = Func::get_lan(pserver->id(), SYSPARAM_NAMESERVER.group_mask_);
lans.insert(lan);
}
}
return count - news.get_array_index();
}
// Randomly probe up to min(|servers_|, configured max) servers under a
// read lock and return the first candidate that is not full, not
// already in `except`, has a FULL disk type, and (when `lans` is
// non-empty) lives in a LAN not yet used. Random probing may test the
// same server twice; failure to find one returns EXIT_NO_DATASERVER.
int ServerManager::choose_replciate_random_choose_server_base_lock_(ServerCollect*& result,
const common::ArrayHelper<uint64_t>& except, const std::set<uint32_t>& lans) const
{
result = NULL;
RWLock::Lock lock(rwmutex_, READ_LOCKER);
int64_t size = std::min(servers_.size(), SYSPARAM_NAMESERVER.choose_target_server_random_max_nums_);
int64_t index = size, random_index = 0;
while (index-- > 0 && NULL == result)
{
random_index = random() % servers_.size();
ServerCollect* pserver = servers_.at(random_index);
assert(NULL != pserver);
bool valid = ((!pserver->is_full()) && (!except.exist(pserver->id()))
&& (DATASERVER_DISK_TYPE_FULL == pserver->get_disk_type()));
// The LAN-diversity constraint only applies once some LAN is taken.
if (valid && !lans.empty())
{
uint32_t lan = Func::get_lan(pserver->id(), SYSPARAM_NAMESERVER.group_mask_);
valid = lans.find(lan) == lans.end();
}
if (valid)
{
result = pserver;
}
}
return (NULL != result) ? TFS_SUCCESS : EXIT_NO_DATASERVER;
}
b. 数据节点逻辑过程
//服务端的响应分为三步
1. prepare_op 校验lease_id合法性、准备block空间等;
2. forward_op 异步发送数据至从副本节点data_slaves、同时写入本主副本节点上;
3. write_file_callback 核实各个副本数据节点的写入成功与否、校验各个副本数据节点的版本一致性、返回写入结果消息。
// Handle a client/slave write request. Three phases (see the notes
// above this function): prepare the operation (lease/block bookkeeping),
// forward the data to slave replicas when acting as master, then write
// locally. The master replies via write_file_callback once every
// replica has reported; a slave replies its own status immediately.
// Always returns TFS_SUCCESS: errors are reported via reply messages,
// so the dispatcher must not send a second error packet.
int ClientRequestServer::write_file(WriteFileMessageV2* message)
{
TIMER_START();
uint64_t block_id = message->get_block_id();
uint64_t attach_block_id = message->get_attach_block_id();
uint64_t file_id = message->get_file_id();
uint64_t lease_id = message->get_lease_id();
int32_t offset = message->get_offset();
int32_t length = message->get_length();
VUINT64 servers = message->get_ds(); // will copy vector
const char* data = message->get_data();
uint64_t master_id = message->get_master_id();
uint64_t peer_id = message->get_connection()->getPeerId();
int32_t flag = message->get_flag();
const FamilyInfoExt& family_info = message->get_family_info();
int64_t family_id = family_info.family_id_;
DsRuntimeGlobalInformation& ds_info = DsRuntimeGlobalInformation::instance();
// This server is the master when the request names it as master_id.
bool is_master = (master_id == ds_info.information_.id_);
int32_t version = is_master ? -1 : message->get_version(); // master won't check version
bool prepare_ok = false;
int ret = TFS_SUCCESS;
if ((NULL == data) || (offset < 0) || (length <= 0))
{
ret = EXIT_PARAMETER_ERROR;
}
// tbnet already receive this packet from network
get_traffic_control().rw_traffic_stat(true, length);
if (TFS_SUCCESS == ret)
{
if (is_master && INVALID_LEASE_ID == lease_id)
{
// first write to master
ret = get_op_manager().prepare_op(attach_block_id,
file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
if (TFS_SUCCESS == ret)
{
// callback & slave will use
if (INVALID_BLOCK_ID == block_id) // data block
{
block_id = attach_block_id;
}
// Rewrite the message in place so forwarded copies carry the
// allocated ids and the first-write-to-slave flag.
message->set_block_id(block_id);
message->set_attach_block_id(attach_block_id);
message->set_file_id(file_id);
message->set_lease_id(lease_id);
message->set_flag(TFS_FILE_FIRST_WRITE_TO_SLAVE);
}
}
else if (!is_master && (flag & TFS_FILE_FIRST_WRITE_TO_SLAVE))
{
// first write to slave
ret = get_op_manager().prepare_op(attach_block_id,
file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
}
else
{
// not the first wirte, just reset operation
ret = get_op_manager().reset_op(attach_block_id, file_id, lease_id, servers);
}
// async op prepare work finished
prepare_ok = (TFS_SUCCESS == ret);
}
// post request to slaves
if ((TFS_SUCCESS == ret) && (servers.size() > 1U) && is_master)
{
ret = get_op_manager().forward_op(message, attach_block_id, family_id, servers);
}
// local write file
BlockInfoV2 local;
if (TFS_SUCCESS == ret)
{
ret = get_op_manager().write_file(block_id, attach_block_id,
file_id, lease_id, data, length, offset, version, local);
// Record this replica's outcome; the master's callback checks when
// all replicas (slaves + local) have reported.
get_op_manager().update_op(attach_block_id, file_id, lease_id, ret, local);
}
// master check if all successful
// slave response to master
if (is_master)
{
if (prepare_ok)
{
write_file_callback(message);
}
else
{
message->reply(new StatusMessage(ret, "master prepare op fail"));
}
}
else
{
SlaveDsRespMessage* resp_msg = new (std::nothrow) SlaveDsRespMessage();
assert(NULL != resp_msg);
resp_msg->set_server_id(ds_info.information_.id_);
resp_msg->set_block_info(local);
resp_msg->set_status(ret);
message->reply(resp_msg);
// slave write fail, release op
if (TFS_SUCCESS != ret)
{
get_op_manager().release_op(attach_block_id, file_id, lease_id);
}
}
TIMER_END();
TBSYS_LOG_DW(ret, "write file %s, blockid: %"PRI64_PREFIX"u, attach_blockid: %"PRI64_PREFIX"u, "
"fileid: %"PRI64_PREFIX"u, leaseid: %"PRI64_PREFIX"u, length: %d, offset: %d, "
"version: %d, role: %s, peer ip: %s, cost: %"PRI64_PREFIX"d, ret: %d",
(TFS_SUCCESS == ret) ? "success" : "fail", block_id, attach_block_id, file_id, lease_id,
length, offset, version, is_master ? "master": "slave",
tbsys::CNetUtil::addrToString(peer_id).c_str(), TIMER_DURATION(), ret);
return TFS_SUCCESS;
}
// Master-side completion check for a write: once every replica has
// reported (check_op), reply success or the collected error to the
// client. On version conflict the nameserver is asked to resolve it;
// on any failure the operation is released since no close will follow.
// Does nothing while replica reports are still outstanding.
int ClientRequestServer::write_file_callback(WriteFileMessageV2* message)
{
uint64_t attach_block_id = message->get_attach_block_id();
uint64_t file_id = message->get_file_id();
uint64_t lease_id = message->get_lease_id();
uint32_t length = message->get_length();
uint32_t offset = message->get_offset();
uint64_t peer_id = message->get_connection()->getPeerId();
OpStat op_stat;
int ret = TFS_SUCCESS;
bool all_finish = get_op_manager().check_op(attach_block_id, file_id, lease_id, op_stat);
if (all_finish)
{
// The aggregated status across all replicas.
ret = op_stat.status_;
if (TFS_SUCCESS != ret)
{
// req ns resolve version conflict
if (EXIT_BLOCK_VERSION_CONFLICT_ERROR == ret)
{
get_op_manager().resolve_block_version_conflict(attach_block_id, file_id, lease_id);
}
message->reply_error_packet(TBSYS_LOG_LEVEL(WARN), ret, op_stat.error_.str().c_str());
// if fail, close will never happen, release op, expire writable block
get_op_manager().release_op(attach_block_id, file_id, lease_id, ret);
}
else
{
WriteFileRespMessageV2* resp_msg = new (std::nothrow) WriteFileRespMessageV2();
assert(NULL != resp_msg);
resp_msg->set_block_id(attach_block_id);
resp_msg->set_file_id(file_id);
resp_msg->set_lease_id(lease_id);
message->reply(resp_msg);
}
// NOTE(review): write stats are recorded only on failure here —
// presumably success is counted elsewhere (e.g. at close); verify.
if (TFS_SUCCESS != ret)
{
get_traffic_control().rw_stat(RW_STAT_TYPE_WRITE, ret, 0 == offset, length);
}
TBSYS_LOG_IW(ret, "WRITE file %s, ret: %d. blockid: %"PRI64_PREFIX"u, "
"fileid: %"PRI64_PREFIX"u, leaseid: %"PRI64_PREFIX"u, "
"length: %d, offset: %d, peer ip: %s, cost: %"PRI64_PREFIX"d",
(TFS_SUCCESS == ret) ? "success": "fail", ret, attach_block_id,
file_id, lease_id, length, offset,
tbsys::CNetUtil::addrToString(peer_id).c_str(), op_stat.cost_);
}
return TFS_SUCCESS;
}
参考: