BeeGFS源码分析3-创建目录

本文深入分析了BeeGFS分布式文件系统在创建目录过程中的关键步骤,涉及管理服务、元数据服务和客户端的交互。重点介绍了管理服务如何处理元数据根节点的获取与设置,以及在节点心跳消息中初始化根元数据服务NodeID的过程。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

这篇文章主要分析BeeGFS在创建目录的过程中,管理服务、元数据服务和客户端的关键部分。由于创建目录的整个过程只涉及元数据,所以不牵扯存储服务。

管理服务

根元数据服务的NodeID是哪个,由管理服务负责。

处理Root服务NodeID获取消息

  • 管理服务可以处理其他服务的获取元数据根节点的消息,具体的函数如下:
// fhgfs_mgmtd\source\net\message\nodes\GetNodesMsgEx.cpp

/**
 * Answers a GetNodesMsg: replies with all nodes of the requested type and, for metadata nodes,
 * additionally with the root node ID and its buddy-mirror flag.
 *
 * @param ctx response context of the requesting peer
 * @return false if no node store exists for the requested node type, true otherwise
 */
bool GetNodesMsgEx::processIncoming(ResponseContext& ctx)
{
   LogContext log("GetNodes incoming");

   LOG_DEBUG_CONTEXT(log, Log_DEBUG, "Received a GetNodesMsg from: " + ctx.peerName() );

   App* const app = Program::getApp();
   const NodeType requestedType = getNodeType();

   LOG_DEBUG_CONTEXT(log, Log_SPAM,
      std::string("NodeType: ") + Node::nodeTypeToStr(requestedType) );

   // look up the store that holds nodes of the requested type
   AbstractNodeStore* const store = app->getAbstractNodeStoreFromType(requestedType);
   if(!store)
   {
      LOG(ERR, "Invalid node type.",
            as("Node Type", Node::nodeTypeToStr(requestedType)),
            as("Sender", ctx.peerName())
         );
      return false;
   }

   // root info is only filled in for metadata nodes; other types send the default values
   NumNodeID rootID;
   bool rootMirrored = false;

   if(requestedType == NODETYPE_Meta)
   {
      rootID = app->getMetaNodes()->getRootNodeNumID();
      rootMirrored = app->getMetaNodes()->getRootIsBuddyMirrored();
   }

   // reference all nodes of the store and ship them together with the root info
   auto allNodes = store->referenceAllNodes();

   ctx.sendResponse(GetNodesRespMsg(rootID, rootMirrored, allNodes));

   return true;
}

处理Root服务NodeID设置消息

  • 管理服务同时也可以设置根元数据服务的NodeID:
// fhgfs_mgmtd\source\net\message\nodes\SetRootNodeIDMsgEx.cpp

/**
 * Handles a request to set the root metadata node ID on the management daemon.
 *
 * While the daemon is shutting down, the request is answered with a TRYAGAIN generic response.
 * Otherwise the new root is applied with ignoreExistingRoot=true and the outcome is reported
 * in a SetRootNodeIDRespMsg.
 *
 * @param ctx response context of the requesting peer
 * @return always true
 */
bool SetRootNodeIDMsgEx::processIncoming(ResponseContext& ctx)
{
   LogContext log("SetRootNodeIDMsg incoming");

   LOG_DEBUG_CONTEXT(log, 4, "Received a SetRootNodeIDMsg from: " + ctx.peerName() );

   App* const app = Program::getApp();

   if (app->isShuttingDown())
   {
      ctx.sendResponse(GenericResponseMsg(GenericRespMsgCode_TRYAGAIN, "Mgmtd shutting down."));
      return true;
   }

   NodeStoreServers* const metaStore = app->getMetaNodes();

   NumNodeID requestedRoot(getRootNodeID() );
   uint16_t mirroredFlag = getRootIsBuddyMirrored();

   // second argument "true": overwrite a root that is already defined
   const bool accepted = metaStore->setRootNodeNumID(requestedRoot, true, mirroredFlag);

   FhgfsOpsErr result = accepted ? FhgfsOpsErr_SUCCESS : FhgfsOpsErr_INTERNAL;

   ctx.sendResponse(SetRootNodeIDRespMsg(result) );

   return true;
}

处理节点心跳消息

  • 当管理服务收到元数据节点的心跳信息时,也会根据其信息进行根元数据服务NodeID的初始化:
// fhgfs_mgmtd\source\net\message\nodes\HeartbeatMsgEx.cpp

/**
 * Processes a heartbeat on the management daemon.
 *
 * Client heartbeats add or update the client in the client node store. Server heartbeats are
 * validated (numeric ID present, known node type, new servers allowed, no ID conflict), then the
 * server is added or updated in its store; metadata servers are additionally registered in the
 * meta state store and the meta capacity pools, and the root info carried by the heartbeat is
 * handed to RegisterNodeMsgEx::processIncomingRoot().
 *
 * Note: the early "return true" paths (shutdown, new servers disabled, ID conflict) return
 * without calling acknowledge().
 *
 * @param ctx response context of the sending peer
 * @return false for heartbeats with an empty string ID, a missing numeric ID (servers) or an
 *    unknown server node type; true otherwise
 */
bool HeartbeatMsgEx::processIncoming(ResponseContext& ctx)
{
   LogContext log("Heartbeat incoming");

   //LOG_DEBUG_CONTEXT(log, Log_DEBUG, std::string("Received a HeartbeatMsg from: ") + peer);

   App* app = Program::getApp();

   // ignore heartbeats during shutdown (no acknowledgement is sent)
   if (app->isShuttingDown())
      return true;

   NodeCapacityPools* metaCapacityPools = app->getMetaCapacityPools();
   HeartbeatManager* heartbeatMgr = app->getHeartbeatMgr();

   bool isNodeNew; // assigned in both the client and the server branch below

   NodeType nodeType = getNodeType();
   std::string nodeID(getNodeID());

   NicAddressList& nicList = getNicList();

   // check for empty nodeID; (sanity check, should never fail)

   if(unlikely(nodeID.empty() ) )
   {
      log.log(Log_WARNING, "Rejecting heartbeat of node with empty long ID "
         "from: " + ctx.peerName() + "; "
         "type: " + Node::nodeTypeToStr(nodeType) );

      return false;
   }


   if(nodeType == NODETYPE_Client)
   { // this is a client heartbeat
      NodeStoreClients* clients = app->getClientNodes();

      // construct node

      auto node = RegisterNodeMsgEx::constructNode(
         nodeID, getNodeNumID(), getPortUDP(), getPortTCP(), nicList);

      node->setNodeType(getNodeType() );
      node->setFhgfsVersion(getFhgfsVersion() );
      node->setFeatureFlags(&getNodeFeatureFlags() );

      // add node to store (or update it)

      isNodeNew = clients->addOrUpdateNode(std::move(node));
   }
   else
   { // this is a server heartbeat

      /* only accept new servers if nodeNumID is set
         (otherwise RegisterNodeMsg would need to be called first) */

      if(!getNodeNumID() )
      { /* shouldn't happen: this server would need to register first to get a nodeNumID assigned */

         log.log(Log_WARNING,
            "Rejecting heartbeat of node without numeric ID: " + nodeID + "; "
            "type: " + Node::nodeTypeToStr(nodeType) );

         return false;
      }

      // get the corresponding node store for this node type

      NodeStoreServers* servers = app->getServerStoreFromType(nodeType);
      if(unlikely(!servers) )
      {
         LOG(ERR, "Invalid node type.",
               as("Node Type", Node::nodeTypeToStr(getNodeType())),
               as("Sender", ctx.peerName()),
               as("NodeID", getNodeNumID()),
               as("Port (UDP)", getPortUDP()),
               as("Port (TCP)", getPortTCP())
            );

         return false;
      }

      // check if adding a new server is allowed (in case this is a server)

      if(!RegisterNodeMsgEx::checkNewServerAllowed(servers, getNodeNumID(), nodeType) )
      { // this is a new server and adding was disabled
         log.log(Log_WARNING, std::string("Registration of new servers disabled. Rejecting: ") +
            nodeID + " (Type: " + Node::nodeTypeToStr(nodeType) + ")");

         return true;
      }

      // construct node

      auto node = RegisterNodeMsgEx::constructNode(
         nodeID, getNodeNumID(), getPortUDP(), getPortTCP(), nicList);

      node->setNodeType(nodeType);
      node->setFhgfsVersion(getFhgfsVersion() );
      node->setFeatureFlags(&getNodeFeatureFlags() );

      std::string typedNodeID = node->getTypedNodeID();

      // add node to store (or update it)

      NumNodeID confirmationNodeNumID; // numeric ID reported back by the store

      isNodeNew = servers->addOrUpdateNodeEx(std::move(node), &confirmationNodeNumID);

      // a differing confirmed ID means the store did not accept the node under the given ID
      if(confirmationNodeNumID != getNodeNumID() )
      { // unable to add node to store
         log.log(Log_WARNING, "Node rejected because of ID conflict. "
            "Given numeric ID: " + getNodeNumID().str() + "; "
            "string ID: " + getNodeID() + "; "
            "type: " + Node::nodeTypeToStr(nodeType) );

         return true;
      }

      // add to capacity pools

      if(nodeType == NODETYPE_Meta)
      {
         // new meta nodes start as "probably offline" with good consistency
         app->getMetaStateStore()->addIfNotExists(getNodeNumID().val(), CombinedTargetState(
            TargetReachabilityState_POFFLINE, TargetConsistencyState_GOOD) );

         // new meta targets are placed in the "low" capacity pool
         bool isNewMetaTarget = metaCapacityPools->addIfNotExists(
            confirmationNodeNumID.val(), CapacityPool_LOW);

         if(isNewMetaTarget)
            heartbeatMgr->notifyAsyncAddedNode(nodeID, getNodeNumID(), nodeType);

         // (note: storage targets get published through MapTargetMsg)
      }

      // handle root node information (if any is given)
      // (done after the node was added to the store, see note in processIncomingRoot)
      RegisterNodeMsgEx::processIncomingRoot(getRootNumID(), nodeType, getRootIsBuddyMirrored());

   } // end of server heartbeat specific handling


   if(isNodeNew)
   { // this node is new
      RegisterNodeMsgEx::processNewNode(nodeID, getNodeNumID(), nodeType, getFhgfsVersion(),
         &nicList, ctx.peerName() );
   }

   // send response
   acknowledge(ctx);

   // persist the state store that belongs to the sender's node type
   if (nodeType == NODETYPE_Meta)
      app->getMetaStateStore()->saveStatesToFile();
   else if (nodeType == NODETYPE_Storage)
      app->getTargetStateStore()->saveStatesToFile();

   return true;
}

// fhgfs_mgmtd\source\net\message\nodes\RegisterNodeMsgEx.cpp

/**
 * Handles the root information contained in a MDS heartbeat.
 *
 * Note: Be careful not to call this before the sending node was actually added to the store (if
 * possible), because a corresponding root heartbeat will be sent to the other nodes if the root
 * node can be referenced from the store.
 *
 * @param rootNumID root node ID carried by the message
 * @param nodeType the type of node that sent the rootNumID; anything but NODETYPE_Meta is ignored
 * @param rootIsBuddyMirrored buddy-mirror flag carried by the message
 */
void RegisterNodeMsgEx::processIncomingRoot(NumNodeID rootNumID, NodeType nodeType,
   bool rootIsBuddyMirrored)
{
   // only metadata nodes carry root info; for them, try to apply it
   if(nodeType == NODETYPE_Meta)
      Program::getApp()->getHeartbeatMgr()->initRootNode(rootNumID, rootIsBuddyMirrored);
}

设置元数据Root服务NodeID

  • 在设置根元数据服务NodeID时,优先选择数值小的那个:
// fhgfs_mgmtd\source\components\HeartbeatManager.cpp

/**
 * Tries to define the root metadata node, without overwriting a root that is already set.
 *
 * Be careful: this method is also called from other threads.
 * Note: after this method, the root node might still be undefined (this is normal).
 *
 * @param rootIDHint 0 to auto-define the root (lowest known meta node ID is used) or a node ID
 *    that is assumed to be the root
 * @param rootIsBuddyMirrored buddy-mirror flag to store together with the root ID
 * @return true if a new root node has been defined
 */
bool HeartbeatManager::initRootNode(NumNodeID rootIDHint, bool rootIsBuddyMirrored)
{
   const bool hintGiven = (rootIDHint != 0);

   // without a hint and without any known meta node there is nothing to pick from
   if(!hintGiven && !metaNodes->getSize() )
      return false;

   if(!hintGiven)
      rootIDHint = metaNodes->getLowestNodeID();

   // "false": an already defined root is kept and the call reports failure
   const bool newRootSet = metaNodes->setRootNodeNumID(rootIDHint, false, rootIsBuddyMirrored);
   if(!newRootSet)
      return false;

   log.log(Log_CRITICAL, "New root directory metadata node: " +
      Program::getApp()->getMetaNodes()->getNodeIDWithTypeStr(rootIDHint) );

   /* (real string ID will be retrieved by notifier before sending the heartbeat) */
   notifyAsyncAddedNode("", rootIDHint, NODETYPE_Meta);

   return true;
}


// fhgfs_common\source\common\nodes\NodeStoreServers.cpp

/**
 * Set internal root node ID together with its buddy-mirror flag.
 *
 * @param id the new root node ID
 * @param ignoreExistingRoot true to overwrite a root that is already defined (this also permits
 *    the invalid ID 0)
 * @param isBuddyMirrored buddy-mirror flag stored alongside the ID
 * @return false if the new ID was rejected (e.g. because we already had an id set and
 * ignoreExistingRoot was false).
 */
bool NodeStoreServers::setRootNodeNumID(NumNodeID id, bool ignoreExistingRoot, bool isBuddyMirrored)
{
   // the invalid ID 0 is only accepted when the caller forces the update
   if(!ignoreExistingRoot && !id)
      return false;

   std::lock_guard<Mutex> lock(mutex);

   const bool rootUndefined = !this->rootNodeID;

   // an existing root wins unless the caller asked to ignore it
   if(!rootUndefined && !ignoreExistingRoot)
      return false;

   this->rootNodeID = id;
   rootIsBuddyMirrored = isBuddyMirrored;

   return true;
}

客户端

元数据Root节点获取

  • 客户端在初始化时会把根元数据服务的NodeID设置为0,然后再通过心跳消息或从管理服务下载节点列表,取回真正的根元数据服务NodeID:
// beegfs-6.18\fhgfs_client_module\source\app\App.c

/**
 * (Excerpt) Initializes the data objects of the client app. Only the creation of the metadata
 * node store is shown; the rest of the body is elided ("...").
 */
bool __App_initDataObjects(App* this, MountConfig* mountConfig)
{
...
   this->metaNodes = NodeStoreEx_construct(this, NODETYPE_Meta);
...
}

// fhgfs_client_module\source\nodes\NodeStoreEx.c

/**
 * Allocates and initializes a node store.
 *
 * @param storeType will be applied to nodes on addOrUpdate()
 * @return the new store, or NULL if the allocation failed
 */
NodeStoreEx* NodeStoreEx_construct(App* app, NodeType storeType)
{
   NodeStoreEx* this = (NodeStoreEx*)os_kmalloc(sizeof(*this) );

   // never hand a failed allocation to the init routine (it writes through the pointer)
   if(!this)
      return NULL;

   NodeStoreEx_init(this, app, storeType);

   return this;
}


// fhgfs_client_module\source\nodes\NodeStoreEx.c

/**
 * Initializes all members of a node store; the root owner starts out undefined.
 *
 * @param storeType will be applied to nodes on addOrUpdate()
 */
void NodeStoreEx_init(NodeStoreEx* this, App* app, NodeType storeType)
{
   // plain members
   this->app = app;
   this->storeType = storeType;
   this->newNodeAppeared = NULL;
   this->_rootOwner = NodeOrGroup_fromGroup(0); // 0 means undefined/invalid

   // embedded sub-objects
   RWLock_init(&this->rwLock);
   NodeTree_init(&this->nodeTree);
}

/**
 * Reads the current root owner under the store's read lock.
 *
 * @return 0 if no root node is known
 */
NodeOrGroup NodeStoreEx_getRootOwner(NodeStoreEx* this)
{
   NodeOrGroup snapshot;

   RWLock_readLock(&this->rwLock); // L O C K
   snapshot = this->_rootOwner;
   RWLock_readUnlock(&this->rwLock); // U N L O C K

   return snapshot;
}


// fhgfs_client_module\source\common\net\message\nodes\HeartbeatMsgEx.c

/**
 * Handles the root information contained in a heartbeat.
 *
 * Root info is only applied if the heartbeat was sent by a metadata node and carries a non-zero
 * root ID. The root owner is passed to NodeStoreEx_setRootOwner() with "false" as last argument;
 * when that call reports success, the new root ID is logged.
 */
void __HeartbeatMsgEx_processIncomingRoot(HeartbeatMsgEx* this, App* app)
{
   Logger* log = App_getLogger(app);
   const char* logContext = "Heartbeat incoming (root)";

   NodeStoreEx* metaNodes;
   bool setRootRes;
   // mirrored roots are addressed by buddy group ID, unmirrored roots by node ID
   NodeOrGroup rootOwner = this->rootIsBuddyMirrored
      ? NodeOrGroup_fromGroup(this->rootNumID.value)
      : NodeOrGroup_fromNode(this->rootNumID);
   NumNodeID rootNumID = HeartbeatMsgEx_getRootNumID(this);

   // check whether root info is defined
   if( (HeartbeatMsgEx_getNodeType(this) != NODETYPE_Meta) || (NumNodeID_isZero(&rootNumID)))
      return;

   // try to apply the contained root info

   metaNodes = App_getMetaNodes(app);

   setRootRes = NodeStoreEx_setRootOwner(metaNodes, rootOwner, false);

   if(setRootRes)
   { // found the very first root
      /* log the numeric value: passing the NumNodeID struct itself through varargs does not
         match an integer conversion specifier */
      Logger_logFormatted(log, Log_CRITICAL, logContext, "Root (by Heartbeat): %u",
         (unsigned)rootNumID.value);
   }

}


// fhgfs_client_module\source\components\InternodeSyncer.c

/**
 * (Excerpt) Downloads the metadata node list from the management node and applies it to the
 * local metadata node store, including the root owner. Parts of the body are elided ("...").
 */
void __InternodeSyncer_downloadAndSyncNodes(InternodeSyncer* this)
{
...
   // fetch meta nodes plus root ID and root buddy-mirror flag from the management node
   if(NodesTk_downloadNodes(this->app, mgmtNode, NODETYPE_Meta, &metaNodesList, &rootNodeID,
      &rootIsBuddyMirrored) )
   {
      // mirrored roots are addressed by buddy group ID, unmirrored roots by node ID
      const NodeOrGroup rootOwner = rootIsBuddyMirrored
         ? NodeOrGroup_fromGroup(rootNodeID.value)
         : NodeOrGroup_fromNode(rootNodeID);

      // sync the store with the downloaded list first, then apply the root owner
      NodeStoreEx_syncNodes(this->metaNodes,
         &metaNodesList, &addedMetaNodes, &removedMetaNodes, localNode);
      // NOTE(review): the "false" presumably means an existing root is not overwritten - confirm
      NodeStoreEx_setRootOwner(this->metaNodes, rootOwner, false);
      __InternodeSyncer_printSyncResults(this, NODETYPE_Meta, &addedMetaNodes, &removedMetaNodes);
   }
...
}

// fhgfs_client_module\source\common\toolkit\NodesTk.c

/**
 * Download node list from given source node.
 *
 * @param sourceNode the node from which you want to download
 * @param nodeType which type of node list you want to download
 * @param outNodeList caller is responsible for the deletion of the received nodes
 * @param outRootNodeID may be NULL if caller is not interested
 * @param outRootIsBuddyMirrored may be NULL if caller is not interested
 * @return true if download successful
 */
bool NodesTk_downloadNodes(App* app, Node* sourceNode, NodeType nodeType, NodeList* outNodeList,
   NumNodeID* outRootNodeID, bool* outRootIsBuddyMirrored)
{
   GetNodesMsg requestMsg;
   RequestResponseArgs rrArgs;
   FhgfsOpsErr commRes;
   GetNodesRespMsg* respMsg;

   // build the request
   GetNodesMsg_initFromValue(&requestMsg, nodeType);
   RequestResponseArgs_prepare(&rrArgs, sourceNode, (NetMessage*)&requestMsg,
      NETMSGTYPE_GetNodesResp);

#ifndef BEEGFS_DEBUG
   // Silence log message unless built in debug mode.
   rrArgs.logFlags |= ( REQUESTRESPONSEARGS_LOGFLAG_CONNESTABLISHFAILED
                      | REQUESTRESPONSEARGS_LOGFLAG_RETRY );
#endif // BEEGFS_DEBUG

   // connect & communicate; on failure there is nothing to parse or to free
   commRes = MessagingTk_requestResponseWithRRArgs(app, &rrArgs);

   if(unlikely(commRes != FhgfsOpsErr_SUCCESS) )
      return false;

   // extract the node list and the optional root info from the response
   respMsg = (GetNodesRespMsg*)rrArgs.outRespMsg;

   GetNodesRespMsg_parseNodeList(app, respMsg, outNodeList);

   if(outRootNodeID)
      *outRootNodeID = GetNodesRespMsg_getRootNumID(respMsg);

   if (outRootIsBuddyMirrored)
      *outRootIsBuddyMirrored = GetNodesRespMsg_getRootIsBuddyMirrored(respMsg);

   // cleanup
   RequestResponseArgs_freeRespBuffers(&rrArgs, app);

   return true;
}


根目录DirEntry获取

  • 在之后的文件系统Mount,初始化超级块时,向根元数据服务获取根目录的DirEntry(用于描述目录的属性信息):
// fhgfs_client_module\source\filesystem\FhgfsOpsSuper.c

/**
 * Fill the file system superblock (vfs object)
 *
 * (Excerpt) Only the creation of the root inode and root dentry is shown; the remaining parts
 * of the body are elided ("...").
 */
int FhgfsOps_fillSuper(struct super_block* sb, void* rawMountOptions, int silent)
{
...
   // init root inode

   // start from zeroed attributes, then fill in the values for the root directory
   memset(&kstat, 0, sizeof(struct kstat) );

   kstat.ino = BEEGFS_INODE_ROOT_INO;
   kstat.mode = S_IFDIR | 0777; // allow access for everyone
   kstat.atime = kstat.mtime = kstat.ctime = current_fs_time(sb);
   kstat.uid = FhgfsCommon_getCurrentKernelUserID();
   kstat.gid = FhgfsCommon_getCurrentKernelGroupID();
   kstat.blksize = Config_getTuneInodeBlockSize(cfg);
   kstat.nlink = 1;

   // root entryInfo is always updated when someone asks for it (so we just set dummy values here)
   EntryInfo_init(&entryInfo, NodeOrGroup_fromGroup(0), StringTk_strDup(""), StringTk_strDup(""),
      StringTk_strDup(""), DirEntryType_DIRECTORY, 0);

   rootInode = __FhgfsOps_newInode(sb, &kstat, 0, &entryInfo, &iSizeHints);
   if(!rootInode || IS_ERR(rootInode) )
   {
      // tear down the fs info; a NULL inode is reported as out-of-memory
      __FhgfsOps_destructFsInfo(sb);
      return IS_ERR(rootInode) ? PTR_ERR(rootInode) : -ENOMEM;
   }

   rootDentry = d_make_root(rootInode);
   if(!rootDentry)
   {
      __FhgfsOps_destructFsInfo(sb);
      return -ENOMEM;
   }
...
}

// fhgfs_client_module\source\common\toolkit\MetadataTk.c

/**
 * Builds an EntryInfo for the root directory from the root owner currently known to the
 * metadata node store.
 *
 * @param outEntryInfo contained values will be kalloced (on success) and need to be kfreed with
 * FhgfsInode_freeEntryMinInfoVals() later.
 * @return result of NodeOrGroup_valid() for the root owner read from the store
 */
bool MetadataTk_getRootEntryInfoCopy(App* app, EntryInfo* outEntryInfo)
{
   NodeOrGroup owner = NodeStoreEx_getRootOwner(App_getMetaNodes(app) );

   const char* parentIDCopy = StringTk_strDup("");
   const char* rootIDCopy = StringTk_strDup(META_ROOTDIR_ID_STR);
   const char* nameCopy = StringTk_strDup("");

   /* Even if the owner is invalid, we still init outEntryInfo and malloc as FhGFS
    * policy says that kfree(NULL) is not allowed (the kernel allows it). */
   EntryInfo_init(outEntryInfo, owner, parentIDCopy, rootIDCopy, nameCopy,
      (DirEntryType) DirEntryType_DIRECTORY, 0);

   return NodeOrGroup_valid(owner);
}

创建目录的入口函数

  • 根据父目录的DirEntry中包含的父目录信息构造要创建目录的DirEntry结构:
// fhgfs_client_module\source\filesystem\FhgfsOpsInode.c

/**
 * Create directory.
 *
 * Sends the mkdir request for the new entry to the metadata side while holding the parent's
 * EntryInfo read-lock and, on remote success, instantiates the local inode and updates the
 * parent's ctime/mtime.
 *
 * @param dir inode of the parent directory
 * @param dentry dentry of the directory to create
 * @param mode requested mode bits (S_IFDIR is added here)
 * @return system error code converted from the remote result on failure, otherwise the result
 *    of the local inode instantiation
 */
#ifdef KERNEL_HAS_UMODE_T
int FhgfsOps_mkdir(struct inode* dir, struct dentry* dentry, umode_t mode)
#else
int FhgfsOps_mkdir(struct inode* dir, struct dentry* dentry, int mode)
#endif // KERNEL_HAS_UMODE_T
{
   struct super_block* sb = dentry->d_sb;
   App* app = FhgfsOps_getApp(sb);
   Logger* log = App_getLogger(app);
   const char* logContext = "FhgfsOps_mkdir";

   FhgfsInode* parentInode = BEEGFS_INODE(dir);
   const char* newDirName = dentry->d_name.name;
   const int currentUmask = current_umask();
   struct inode* inode = dentry->d_inode;

   struct CreateInfo createInfo;
   EntryInfo createdEntryInfo;
   FhgfsIsizeHints iSizeHints;
   FhgfsOpsErr remoteRes;
   int instantiateRes;

   if(unlikely(Logger_getLogLevel(log) >= 5) )
      FhgfsOpsHelper_logOp(5, app, dentry, inode, logContext);

   mode |= S_IFDIR; // just make sure this is a dir

   CreateInfo_init(app, dir, newDirName, mode, currentUmask, false, &createInfo);

   // the parent's EntryInfo must not change while the request is built and sent
   FhgfsInode_entryInfoReadLock(parentInode); // LOCK EntryInfo

   remoteRes = FhgfsOpsRemoting_mkdir(app, FhgfsInode_getEntryInfo(parentInode), &createInfo,
      &createdEntryInfo);

   FhgfsInode_entryInfoReadUnlock(parentInode); // UNLOCK EntryInfo

   if(remoteRes != FhgfsOpsErr_SUCCESS)
      return FhgfsOpsErr_toSysErr(remoteRes);

   // remote success => create the local inode
   instantiateRes = __FhgfsOps_instantiateInode(dentry, &createdEntryInfo, NULL, &iSizeHints);

   dir->i_ctime = dir->i_mtime = current_fs_time(sb);

   return instantiateRes;
}
// fhgfs_client_module\source\common\toolkit\MetadataTk.c

/**
 * Used to initialize struct CreateInfo. This function always should be used, to make sure,
 * values are not forgotten.
 *
 * Note: preferred meta/storage targets are automatically set to app's preferred meta/storage
 *       targets; keep that in mind when you're cleaning up.
 *
 * @param parentDirInode may be NULL; if it has S_ISGID set, the new entry gets the parent's
 *    group and (for directories) S_ISGID as well
 * @param entryName just a reference, not copied
 */
void CreateInfo_init(App* app, struct inode* parentDirInode, const char* entryName,
   int mode, int umask, bool isExclusiveCreate, CreateInfo* outCreateInfo)
{
   outCreateInfo->userID = FhgfsCommon_getCurrentUserID();

   // groupID and mode logic taken from inode_init_owner()
   if (!parentDirInode || !(parentDirInode->i_mode & S_ISGID) )
   { // no setgid parent => group of the current caller
      outCreateInfo->groupID = FhgfsCommon_getCurrentGroupID();
   }
   else
   { // setgid parent => inherit group; directories also inherit the setgid bit
      outCreateInfo->groupID = i_gid_read(parentDirInode);

      if (S_ISDIR(mode))
         mode |= S_ISGID;
   }

   outCreateInfo->entryName         = entryName;
   outCreateInfo->mode              = mode;
   outCreateInfo->umask             = umask;
   outCreateInfo->isExclusiveCreate = isExclusiveCreate;

   outCreateInfo->preferredStorageTargets = App_getPreferredStorageTargets(app);
   outCreateInfo->preferredMetaTargets    = App_getPreferredMetaNodes(app);
}

Inode读写锁

  • 在创建目录期间会对其进行加锁操作,具体如下:
// fhgfs_client_module\source\filesystem\FhgfsInode.h

/*
 * Get an EntryInfo read-lock.
 *
 * Note: Must be taken on reading parentEntryID. Should not be taken if only entryID is read,
 *       as entryID is never updated.
 *       See FhgfsInode_updateEntryInfoUnlocked() and FhgfsInode_updateEntryInfoOnRenameUnlocked()
 * Note: If the root inode is not initialized, it will be initialized by this method under
 *       a write-lock.
 * Note: Release with FhgfsInode_entryInfoReadUnlock().
 */
void FhgfsInode_entryInfoReadLock(FhgfsInode* this)
{
   // root init runs first, before the read-lock is taken
   _FhgfsInode_initRootEntryInfo(this); // NOTE: might temporarily writelock entryInfoLock

   RWLock_readLock(&this->entryInfoLock); // Read-LOCK
}

/**
 * Release the EntryInfo read-lock (read-unlocks entryInfoLock).
 */
void FhgfsInode_entryInfoReadUnlock(FhgfsInode* this)
{
   RWLock_readUnlock(&this->entryInfoLock);
}

/**
 * Get an EntryInfo write-lock.
 *
 * Note: Also might initialize the root-inode.
 * Note: Release with FhgfsInode_entryInfoWriteUnlock().
 */
void FhgfsInode_entryInfoWriteLock(FhgfsInode* this)
{
   // root init runs first, before the write-lock is taken
   _FhgfsInode_initRootEntryInfo(this); // NOTE: might temporarily writelock entryInfoLock

   RWLock_writeLock(&this->entryInfoLock); // Write-LOCK
}

/**
 * Release the EntryInfo write-lock (write-unlocks entryInfoLock).
 */
void FhgfsInode_entryInfoWriteUnlock(FhgfsInode* this)
{
   RWLock_writeUnlock(&this->entryInfoLock);
}
  • 读写锁的基本操作函数:

// fhgfs_client_module\source\common\threading\RWLock.h

/**
 * Initialize the lock by initializing the wrapped rw_semaphore (init_rwsem).
 */
void RWLock_init(RWLock* this)
{
   init_rwsem(&this->rwSem);
}

/**
 * Counterpart of RWLock_init(); intentionally empty.
 */
void RWLock_uninit(RWLock* this)
{
   // rw_semaphores don't need any kind of uninit
}

/**
 * Acquire the lock for writing (down_write on the wrapped rw_semaphore).
 */
void RWLock_writeLock(RWLock* this)
{
   down_write(&this->rwSem);
}

/**
 * Try to acquire the write lock and return immediately, even if the lock cannot be acquired
 * right away (down_write_trylock on the wrapped rw_semaphore).
 *
 * @return 1 if lock acquired, 0 if contention
 */
int RWLock_writeTryLock(RWLock* this)
{
   return down_write_trylock(&this->rwSem);
}

/**
 * Acquire the lock for reading (down_read on the wrapped rw_semaphore).
 */
void RWLock_readLock(RWLock* this)
{
   down_read(&this->rwSem);
}

/**
 * Release a write lock taken with RWLock_writeLock() or a successful RWLock_writeTryLock()
 * (up_write on the wrapped rw_semaphore).
 */
void RWLock_writeUnlock(RWLock* this)
{
   up_write(&this->rwSem);
}

/**
 * Release a read lock taken with RWLock_readLock() (up_read on the wrapped rw_semaphore).
 */
void RWLock_readUnlock(RWLock* this)
{
   up_read(&this->rwSem);
}

目录创建远程请求

  • 当构造完目录的DirEntry结构后,就会向父目录元数据节点发送创建目录消息:
// fhgfs_client_module\source\net\filesystem\FhgfsOpsRemoting.c

/**
 * Sends a MkDirMsg for a new directory below parentInfo to the peer derived from parentInfo and
 * evaluates the response.
 *
 * @param parentInfo entry info of the parent directory; also determines the peer to contact
 * @param createInfo name, ownership, mode and preferred targets of the new directory
 * @param outEntryInfo attribs set only in case of success (and must then be kfreed by the
 * caller)
 * @return the communication error if the request failed, otherwise the result code carried by
 *    the response
 */
FhgfsOpsErr FhgfsOpsRemoting_mkdir(App* app, const EntryInfo* parentInfo,
   struct CreateInfo* createInfo, EntryInfo* outEntryInfo)
{
   Logger* log = App_getLogger(app);
   const char* logContext = "Remoting (mkdir)";

   MkDirMsg mkDirMsg;
   RequestResponseNode rrNode = {
      .peer = rrpeer_from_entryinfo(parentInfo),
      .nodeStore = app->metaNodes,
      .targetStates = app->metaStateStore,
      .mirrorBuddies = app->metaBuddyGroupMapper
   };
   RequestResponseArgs rrArgs;
   FhgfsOpsErr commRes;
   FhgfsOpsErr mkDirRes;
   MkDirRespMsg* respMsg;

   // prepare request
   MkDirMsg_initFromEntryInfo(&mkDirMsg, parentInfo, createInfo);

   RequestResponseArgs_prepare(&rrArgs, NULL, (NetMessage*)&mkDirMsg, NETMSGTYPE_MkDirResp);

   // communicate; on failure there is no response buffer to release
   commRes = MessagingTk_requestResponseNodeRetryAutoIntr(app, &rrNode, &rrArgs);

   if(unlikely(commRes != FhgfsOpsErr_SUCCESS) )
      return commRes;

   // handle result
   respMsg = (MkDirRespMsg*)rrArgs.outRespMsg;
   mkDirRes = (FhgfsOpsErr)MkDirRespMsg_getResult(respMsg);

   if(mkDirRes != FhgfsOpsErr_SUCCESS)
   { // remote error; "exists" is not worth bothering the user with
      const int logLevel = (mkDirRes == FhgfsOpsErr_EXISTS) ? Log_DEBUG : Log_NOTICE;

      Logger_logFormatted(log, logLevel, logContext,
         "MkDirResp ownerID: %u%s parentID: %s name: %s error code: %s ",
         EntryInfo_getOwner(parentInfo), EntryInfo_getOwnerFlag(parentInfo), parentInfo->entryID,
         createInfo->entryName, FhgfsOpsErr_toErrString(mkDirRes));
   }
   else
   { // success => hand a copy of the new entry's info to the caller
      EntryInfo_dup(MkDirRespMsg_getEntryInfo(respMsg), outEntryInfo );
   }

   // clean-up
   RequestResponseArgs_freeRespBuffers(&rrArgs, app);

   return mkDirRes;
}
  • 消息的初始化创建:
// fhgfs_client_module\source\common\net\message\storage\creating\MkDirMsg.h

/**
 * Basic initialization: sets up the embedded NetMessage with type NETMSGTYPE_MkDir and the
 * MkDirMsg operations table.
 */
void MkDirMsg_init(MkDirMsg* this)
{
   NetMessage_init(&this->netMessage, NETMSGTYPE_MkDir, &MkDirMsg_Ops);
}

/**
 * Initializes the message for creating a directory below the given parent.
 *
 * @param parentInfo just a reference, so do not free it as long as you use this object!
 * @param createInfo its entryName and preferredMetaTargets are referenced (not copied); user,
 *    group, mode and umask are copied by value
 */
void MkDirMsg_initFromEntryInfo(MkDirMsg* this, const EntryInfo* parentInfo,
   struct CreateInfo* createInfo)
{
   MkDirMsg_init(this);

   // where to create and under which name
   this->parentInfo    = parentInfo;
   this->newDirName    = createInfo->entryName;
   this->newDirNameLen = strlen(createInfo->entryName);

   // placement preference
   this->preferredNodes = createInfo->preferredMetaTargets;

   // ownership and permissions
   this->userID  = createInfo->userID;
   this->groupID = createInfo->groupID;
   this->mode    = createInfo->mode;
   this->umask   = createInfo->umask;
}

向远程节点发送消息

  • 向元数据节点发送创建目录消息:
// fhgfs_client_module\source\common\toolkit\MessagingTk.c

/**
 * Sends a message to a node and receives a response.
 * Can handle target states and mapped mirror IDs. Node does not need to be referenced by caller.
 *
 * If target states are provided, communication might be skipped for certain states.
 *
 * This version will automatically block signals including interrupt and reenables interrupt if the
 * response doesn't arrive within a few seconds. This is intended to help avoid cases where we miss
 * a reply due to the user pressing ctrl+c (under the assumption that the user won't care to wait a
 * few seconds longer for interruption).
 *
 * note: uses the number of retries that has been defined in the app config.
 * note: blocked signals will be restored to original values before this method returns.
 *
 * @param rrArgs outRespBuf must be returned to the store - not freed; rrArgs->nodeID may optionally
 *    be provided when calling this.
 * @return received message and buffer are available through rrArgs in case of success.
 */
FhgfsOpsErr MessagingTk_requestResponseNodeRetryAutoIntr(App* app, RequestResponseNode* rrNode,
   RequestResponseArgs* rrArgs)
{
   sigset_t savedSignals;
   FhgfsOpsErr result;

   // retry count comes from the app config; response buffer is taken from the buffer store
   rrArgs->numRetries = Config_getConnNumCommRetries(App_getConfig(app) );
   rrArgs->rrFlags = REQUESTRESPONSEARGS_FLAG_USEDELAYEDINTERRUPT |
      REQUESTRESPONSEARGS_FLAG_ALLOWSTATESLEEP;
   rrArgs->respBufType = MessagingTkBufType_BufStore;

   // keep thread interruptible for connect (will later be made uninterruptible for send/recv)
   SignalTk_blockSignals(true, &savedSignals); // B L O C K _ S I G s

   result = __MessagingTk_requestResponseNodeRetry(app, rrNode, rrArgs);

   SignalTk_restoreSignals(&savedSignals); // U N B L O C K _ S I G s

   return result;
}
  • 发送消息并等待返回:
// fhgfs_client_module\source\common\toolkit\MessagingTk.c

/**
 * Sends a message to a node and receives a response, retrying on communication errors.
 * Can handle target states and mapped mirror IDs. Node does not need to be referenced by caller.
 *
 * If target states are provided, communication might be skipped for certain states.
 *
 * Each pass of the retry loop:
 *  1. resolves the destination node ID (the primary of the buddy group if the peer is a mirror
 *     group) and, for messages that support it, acquires a sequence number;
 *  2. checks the target state and either bails out, sleeps and re-checks, or proceeds;
 *  3. references the node (unless the caller already provided rrArgs->node);
 *  4. performs one send/receive round trip and decides whether a retry is allowed.
 *
 * @param rrNode peer description (node or mirror group) plus the stores used to resolve it.
 * @param rrArgs rrArgs->node may optionally be provided (already referenced) when calling this;
 *    rrArgs->numRetries == 0 means "no retry limit" as long as retries are enabled in the app.
 * @return received message and buffer are available through rrArgs in case of success;
 *    FhgfsOpsErr_UNKNOWNNODE for an invalid group, unknown node or inactive node;
 *    FhgfsOpsErr_INTERRUPTED if a signal is pending; FhgfsOpsErr_COMMUNICATION if the target is
 *    offline/unusable or all retries are used up; other codes are passed through from the
 *    communication helper.
 */
FhgfsOpsErr __MessagingTk_requestResponseNodeRetry(App* app, RequestResponseNode* rrNode,
   RequestResponseArgs* rrArgs)
{
   const char* logContext = "Messaging (RPC node)";

   unsigned currentRetryNum = 0; // used number of retries so far
   FhgfsOpsErr commRes;
   struct BuddySequenceNumber* handle = NULL; // non-NULL while we hold a sequence number
   struct MirrorBuddyGroup* group = NULL;
   bool wasIndirectCommErr = false;

   BEEGFS_BUG_ON_DEBUG(rrNode->targetStates == NULL, "targetStates missing");
   BEEGFS_BUG_ON_DEBUG(rrNode->mirrorBuddies == NULL, "mirrorBuddies missing");

   for( ; ; ) // retry loop
   {
      bool nodeNeedsRelease = false; // true if this iteration took the node reference itself
      int acquireSeqRes = 0;
      bool seqAckIsSelective = false;

      // select the right targetID

      NumNodeID nodeID; // don't modify caller's nodeID

      if (rrNode->peer.isMirrorGroup)
      { // given targetID refers to a buddy mirror group
         nodeID = (NumNodeID){MirrorBuddyGroupMapper_getPrimaryTargetID(rrNode->mirrorBuddies,
               rrNode->peer.address.group)};

         if (unlikely(NumNodeID_isZero(&nodeID)))
         {
            Logger* log = App_getLogger(app);
            Logger_logErrFormatted(log, logContext, "Invalid mirror buddy group ID: %u",
               rrNode->peer.address.group);

            commRes = FhgfsOpsErr_UNKNOWNNODE;
            goto exit;
         }

         if (rrArgs->requestMsg->ops->supportsSequenceNumbers)
         {
            rrArgs->requestMsg->msgHeader.msgFlags |= MSGHDRFLAG_HAS_SEQUENCE_NO;

            // a sequence number of 0 means "none acquired yet" (also after a reset below)
            if (rrArgs->requestMsg->msgHeader.msgSequence == 0)
               acquireSeqRes = MirrorBuddyGroupMapper_acquireSequenceNumber(rrNode->mirrorBuddies,
                     rrNode->peer.address.group, &rrArgs->requestMsg->msgHeader.msgSequence,
                     &rrArgs->requestMsg->msgHeader.msgSequenceDone, &seqAckIsSelective, &handle,
                     &group);

            if (!acquireSeqRes)
            {
               if (seqAckIsSelective)
                  rrArgs->requestMsg->msgHeader.msgFlags |= MSGHDRFLAG_IS_SELECTIVE_ACK;
            }
            else
            {
               Logger* log = App_getLogger(app);
               NodeType storeType = NodeStoreEx_getStoreType(rrNode->nodeStore);
               // the logged value is the mirror buddy group ID (not an IP address)
               Logger_logFormatted(log, Log_WARNING, logContext,
                     "Could not generate seq#. Group ID: %u; type: %s", rrNode->peer.address.group,
                     Node_nodeTypeToStr(storeType));

               commRes = acquireSeqRes == EINTR ? FhgfsOpsErr_INTERRUPTED : FhgfsOpsErr_UNKNOWNNODE;
               goto exit;
            }
         }
      }
      else
         nodeID = rrNode->peer.address.target;

      // check target state

      if (rrNode->targetStates)
      {
         CombinedTargetState state;
         bool getStateRes = TargetStateStore_getState(rrNode->targetStates, nodeID.value,
            &state);

         // usable means: state known, online, and (for mirror groups) consistency state good
         if (!getStateRes ||
               state.reachabilityState != TargetReachabilityState_ONLINE ||
               (rrNode->peer.isMirrorGroup &&
                  state.consistencyState != TargetConsistencyState_GOOD))
         {
            if(state.reachabilityState == TargetReachabilityState_OFFLINE)
            { // no need to wait for offline servers
               LOG_DEBUG_FORMATTED(App_getLogger(app), Log_SPAM, logContext,
                  "Skipping communication with offline nodeID: %u", nodeID.value);

               commRes = FhgfsOpsErr_COMMUNICATION;
               goto exit;
            }

            if(!(rrArgs->rrFlags & REQUESTRESPONSEARGS_FLAG_ALLOWSTATESLEEP) )
            { // caller did not allow sleeping if target state is not {good, offline}
               LOG_DEBUG_FORMATTED(App_getLogger(app), Log_SPAM, logContext,
                  "Skipping communication with nodeID: %u; "
                  "target state: %s / %s",
                  nodeID.value, TargetStateStore_reachabilityStateToStr(state.reachabilityState),
                  TargetStateStore_consistencyStateToStr(state.consistencyState) );

               commRes = FhgfsOpsErr_COMMUNICATION;
               goto exit;
            }

            // sleep on states other than "good" and "offline" with mirroring
            if(rrNode->mirrorBuddies)
            {
               LOG_DEBUG_FORMATTED(App_getLogger(app), Log_DEBUG, logContext,
                  "Waiting before communication because of node state. "
                  "nodeID: %u; node state: %s / %s",
                  nodeID.value, TargetStateStore_reachabilityStateToStr(state.reachabilityState),
                  TargetStateStore_consistencyStateToStr(state.consistencyState) );

               Thread_sleep(MSGTK_STATE_SLEEP_MS);
               if(Thread_isSignalPending() )
               { // make sure we don't loop endless if signal pending
                  LOG_DEBUG_FORMATTED(App_getLogger(app), Log_DEBUG, logContext,
                     "Waiting before communication was interrupted by signal. "
                     "nodeID: %u; node state: %s / %s",
                     nodeID.value, TargetStateStore_reachabilityStateToStr(state.reachabilityState),
                     TargetStateStore_consistencyStateToStr(state.consistencyState) );

                  commRes = FhgfsOpsErr_INTERRUPTED;
                  goto exit;
               }

               currentRetryNum = 0; // reset retries in case of unusable target state
               continue;
            }
         }
      }

      // reference node (if not provided by caller already)

      if(!rrArgs->node)
      {
         rrArgs->node = NodeStoreEx_referenceNode(rrNode->nodeStore, nodeID);

         if(!rrArgs->node)
         {
            Logger* log = App_getLogger(app);
            NodeType storeType = NodeStoreEx_getStoreType(rrNode->nodeStore);
            Logger_logFormatted(log, Log_WARNING, logContext, "Unknown nodeID: %u; type: %s",
               nodeID.value, Node_nodeTypeToStr(storeType));

            commRes = FhgfsOpsErr_UNKNOWNNODE;
            goto exit;
         }

         nodeNeedsRelease = true;
      }
      else
         BEEGFS_BUG_ON_DEBUG(Node_getNumID(rrArgs->node).value != nodeID.value,
            "Mismatch between given rrArgs->node ID and nodeID");

      // communicate

      commRes = __MessagingTk_requestResponseWithRRArgsComm(app, rrArgs, group,
            &wasIndirectCommErr);

      if(likely(commRes == FhgfsOpsErr_SUCCESS) )
         goto release_node_and_break;
      else
      if(Thread_isSignalPending() )
      { // no retry allowed in this situation
         commRes = FhgfsOpsErr_INTERRUPTED;
         goto release_node_and_break;
      }
      else
      if(!Node_getIsActive(rrArgs->node) )
      { // no retry allowed in this situation
         commRes = FhgfsOpsErr_UNKNOWNNODE;
         goto release_node_and_break;
      }
      else
      if(commRes == FhgfsOpsErr_WOULDBLOCK)
      { // no retries in this case
         commRes = FhgfsOpsErr_COMMUNICATION;
         goto release_node_and_break;
      }
      else
      if( (commRes == FhgfsOpsErr_AGAIN) && App_getConnRetriesEnabled(app) )
      { // retry infinitely
         currentRetryNum = 0;

         Thread_sleep(MSGTK_INFINITE_RETRY_WAIT_MS); // sleep interruptible

         goto release_node_and_continue;
      }
      else
      if(commRes != FhgfsOpsErr_COMMUNICATION)
      { // no retry allowed in this situation
         goto release_node_and_break;
      }

      // only FhgfsOpsErr_COMMUNICATION gets here: retry if enabled and the budget allows it
      if(App_getConnRetriesEnabled(app) &&
         (!rrArgs->numRetries || (currentRetryNum < rrArgs->numRetries) ) )
      { // we have a retry left
         MessagingTk_waitBeforeRetry(currentRetryNum);
         currentRetryNum++;

         /* if the metadata server reports an indirect communication error, we must retry the
          * communication with a new sequence number. if we reuse the current sequence number, the
          * meta server will continue to reply "indirect communication error", sending us into a
          * very long loop of pointless retries, followed by -EIO to userspace. */
         if (wasIndirectCommErr && handle)
         {
            MirrorBuddyGroup_releaseSequenceNumber(group, &handle);
            rrArgs->requestMsg->msgHeader.msgSequence = 0;
            wasIndirectCommErr = false;
            handle = NULL;
         }

         if(currentRetryNum == 1 // log retry message only on first retry (to not spam the log)
            && !(rrArgs->logFlags & REQUESTRESPONSEARGS_LOGFLAG_RETRY) )
         {
            Logger* log = App_getLogger(app);
            Logger_logFormatted(log, Log_NOTICE, logContext,
               "Retrying communication with node: %s", Node_getNodeIDWithTypeStr(rrArgs->node) );
            Logger_logFormatted(log, Log_DEBUG, logContext,
               "Message type: %hu", NetMessage_getMsgType(rrArgs->requestMsg) );
         }
      }
      else
      { // no more retries left
         commRes = FhgfsOpsErr_COMMUNICATION;
         goto release_node_and_break;
      }

      // drop our own node reference so that the next iteration looks the node up again
   release_node_and_continue:
      if(nodeNeedsRelease)
      {
         Node_put(rrArgs->node);
         rrArgs->node = NULL;
      }

      continue;

      // cleanup before early loop exit
   release_node_and_break:
      if(nodeNeedsRelease)
      {
         Node_put(rrArgs->node);
         rrArgs->node = NULL;
      }

      break;
   }

exit:
   // give back a sequence number that is still held (on success and on failure alike)
   if (handle)
      MirrorBuddyGroup_releaseSequenceNumber(group, &handle);

   return commRes;
}

使用套接字发送消息

  • 使用套接字进行数据收发,以及消息的序列化和反序列化:
// fhgfs_client_module\source\common\toolkit\MessagingTk.c

/**
 * Send a request message to a node and receive the response (a single attempt, no retries).
 *
 * The same buffer (rrArgs->outRespBuf) is used for the serialized request and for the received
 * response.
 *
 * @param rrArgs:
 * .node receiver of msg;
 * .requestMsg the message that should be sent to the receiver;
 * .respMsgType expected response message type;
 * .outRespBuf response buffer if successful (must be returned to store by the caller);
 * .outRespMsg response message if successful (must be deleted by the caller);
 * @param group passed through to the generic response handler.
 * @param wasIndirectCommErr set to true if an AckNotifyResp was received instead of the expected
 *    response; may also be set by the generic response handler.
 * @return FhgfsOpsErr_COMMUNICATION on comm error, FhgfsOpsErr_WOULDBLOCK if remote side
 *    encountered an indirect comm error and suggests not to try again, FhgfsOpsErr_AGAIN if other
 *    side is suggesting infinite retries, FhgfsOpsErr_INTERNAL if the request does not fit into
 *    the store buffer or the response has an unexpected type, FhgfsOpsErr_OUTOFMEM if the buffer
 *    allocation failed.
 */
FhgfsOpsErr __MessagingTk_requestResponseWithRRArgsComm(App* app,
   RequestResponseArgs* rrArgs, MirrorBuddyGroup* group, bool* wasIndirectCommErr)
{
   /* note: keep in mind that there are multiple alternative response buf alloc types available,
      e.g. "kmalloc" or "get from store". */

   Logger* log = App_getLogger(app);
   const char* logContext = "Messaging (RPC)";

   NodeConnPool* connPool = Node_getConnPool(rrArgs->node);

   FhgfsOpsErr retVal = FhgfsOpsErr_COMMUNICATION;

   sigset_t oldSignalSet;
   bool useDelayedRecvInterrupt = false;

   unsigned bufLen; // length of shared send/recv buffer
   unsigned sendBufLen; // serialization length for sending
   ssize_t respRes = 0;
   ssize_t sendRes;

   // cleanup init
   Socket* sock = NULL;
   rrArgs->outRespBuf = NULL;
   rrArgs->outRespMsg = NULL;

   // connect
   // note: acquireStreamSocket() will fail immediately if a signal is pending

   sock = NodeConnPool_acquireStreamSocket(connPool);
   if(unlikely(!sock) )
   { // not connected
      if(!(rrArgs->logFlags & REQUESTRESPONSEARGS_LOGFLAG_CONNESTABLISHFAILED) &&
         !Thread_isSignalPending() )
      { // only log once and only if user didn't manually interrupt with signal (to avoid log spam)
         Logger_logFormatted(log, Log_WARNING, logContext,
            "Unable to connect to: %s", Node_getNodeIDWithTypeStr(rrArgs->node) );
         Logger_logFormatted(log, Log_DEBUG, logContext,
            "Message type: %hu", NetMessage_getMsgType(rrArgs->requestMsg) );

         rrArgs->logFlags |= REQUESTRESPONSEARGS_LOGFLAG_CONNESTABLISHFAILED;
      }

      // nothing to clean up yet: no socket, no buffer, signal mask untouched
      return FhgfsOpsErr_COMMUNICATION;
   }

   // block interrupt signal

   if(rrArgs->rrFlags & REQUESTRESPONSEARGS_FLAG_USEDELAYEDINTERRUPT)
   {
      SignalTk_blockSignals(false, &oldSignalSet); // B L O C K _ S I G s (incl. SIGINT)

      // (note: oldSignalSet is a set of previously blocked signals, so we use "!contains")
      useDelayedRecvInterrupt = !SignalTk_containsSignal(&oldSignalSet, SIGINT);
   }

   // prepare send buffer

   sendBufLen = NetMessage_getMsgLength(rrArgs->requestMsg);

   if(rrArgs->respBufType == MessagingTkBufType_BufStore)
   { // pre-alloc'ed buffer from store

      NoAllocBufferStore* bufStore = App_getMsgBufStore(app);
      bufLen = NoAllocBufferStore_getBufSize(bufStore);

      if(unlikely(bufLen < sendBufLen) )
      { // should never happen: trying to send a msg that is larger than pre-alloc'ed buf size
         Logger_logFormatted(log, Log_CRITICAL, logContext,
            "BufferStore buf size (%u) too small for msg length (%u). Message type: %hu",
            bufLen, sendBufLen, NetMessage_getMsgType(rrArgs->requestMsg) );

         retVal = FhgfsOpsErr_INTERNAL;
         goto socket_invalidate;
      }

      rrArgs->outRespBuf = NoAllocBufferStore_waitForBuf(bufStore);
   }
   else
   { // alloc'ed buffer

      // the buffer must hold the request as well as the response
      bufLen = MAX(MSGTK_KMALLOC_RECV_BUF_LEN, sendBufLen);

      rrArgs->outRespBuf = (char*)os_kmalloc(bufLen);
      if(unlikely(!rrArgs->outRespBuf) )
      {
         Logger_logFormatted(log, Log_CRITICAL, logContext,
            "Buffer allocation failed. Message type: %hu; Alloc size: %u",
            NetMessage_getMsgType(rrArgs->requestMsg), bufLen);

         retVal = FhgfsOpsErr_OUTOFMEM;
         goto socket_invalidate;
      }
   }

   NetMessage_serialize(rrArgs->requestMsg, rrArgs->outRespBuf, sendBufLen);

   // send request

   sendRes = Socket_send(sock, rrArgs->outRespBuf, sendBufLen, 0);

   if(unlikely(sendRes != (ssize_t)sendBufLen) )
      goto socket_exception;

   // receive response
   // (note: recvMsgBufEx might re-enable SIGINT if useDelayedRecvInterrupt is set)

   respRes = MessagingTk_recvMsgBufEx(app, sock, rrArgs->outRespBuf, bufLen,
      useDelayedRecvInterrupt);

   if(unlikely(respRes <= 0) )
   { // error
      /* NOTE(review): unlike the socket_exception path, this path does not set
       * REQUESTRESPONSEARGS_LOGFLAG_COMMERR after logging, so these messages may repeat on
       * retries - confirm whether that is intended. */
      if(!(rrArgs->logFlags & REQUESTRESPONSEARGS_LOGFLAG_COMMERR) )
      {
         if(Thread_isSignalPending() )
            Logger_logFormatted(log, Log_NOTICE, logContext,
               "Receive interrupted by signal. Node: %s @ %s",
               Node_getNodeIDWithTypeStr(rrArgs->node), Socket_getPeername(sock) );
         else
         if(respRes == -ETIMEDOUT)
            Logger_logFormatted(log, Log_WARNING, logContext,
               "Receive timed out from %s @ %s",
               Node_getNodeIDWithTypeStr(rrArgs->node), Socket_getPeername(sock) );
         else
            Logger_logFormatted(log, Log_WARNING, logContext,
               "Receive failed from %s @ %s (recv result: %lld)",
               Node_getNodeIDWithTypeStr(rrArgs->node), Socket_getPeername(sock),
               (long long)respRes); // cast: ssize_t is not guaranteed to match %lld

         Logger_logFormatted(log, Log_DEBUG, logContext,
            "Expected response type: %u", rrArgs->respMsgType);
      }

      goto socket_invalidate;
   }

   // got response => deserialize it
   rrArgs->outRespMsg = NetMessageFactory_createFromBuf(app, rrArgs->outRespBuf, respRes);

   if (unlikely(rrArgs->outRespMsg->msgHeader.msgType == NETMSGTYPE_AckNotifyResp))
   {
      /* a failover happened before the primary could send a negative response to us, and the
       * secondary has already received word about the failed operation. treat this case like a
       * communication error and retry the message with a new sequence number. */
      *wasIndirectCommErr = true;
      goto socket_invalidate;
   }

   if(unlikely(NetMessage_getMsgType(rrArgs->outRespMsg) == NETMSGTYPE_GenericResponse) )
   { // special control msg received
      retVal = __MessagingTk_handleGenericResponse(app, rrArgs, group, wasIndirectCommErr);
      if(retVal != FhgfsOpsErr_INTERNAL)
      { // we can re-use the connection
         NodeConnPool_releaseStreamSocket(connPool, sock);
         goto cleanup_no_socket;
      }

      goto socket_invalidate;
   }

   if(unlikely(NetMessage_getMsgType(rrArgs->outRespMsg) != rrArgs->respMsgType) )
   { // response invalid (wrong msgType)
      Logger_logErrFormatted(log, logContext,
         "Received invalid response type: %hu; expected: %d. Disconnecting: %s (%s)",
         NetMessage_getMsgType(rrArgs->outRespMsg), rrArgs->respMsgType,
         Node_getNodeIDWithTypeStr(rrArgs->node), Socket_getPeername(sock) );

      retVal = FhgfsOpsErr_INTERNAL;
      goto socket_invalidate;
   }

   // correct response => return it (through rrArgs)
   // (outRespBuf and outRespMsg stay set; the caller is responsible for them from here on)

   NodeConnPool_releaseStreamSocket(connPool, sock);

   if(rrArgs->rrFlags & REQUESTRESPONSEARGS_FLAG_USEDELAYEDINTERRUPT)
      SignalTk_restoreSignals(&oldSignalSet); // U N B L O C K _ S I G s

   return FhgfsOpsErr_SUCCESS;


   // error handling (something went wrong)...
   // (the labels below fall through: log -> invalidate socket -> free msg/buf -> restore signals)

   socket_exception:
   {
      if(!(rrArgs->logFlags & REQUESTRESPONSEARGS_LOGFLAG_COMMERR) )
      {
         Logger_logErrFormatted(log, logContext,
            "Communication error: Node: %s (comm result: %lld; message type: %hu)",
            Node_getNodeIDWithTypeStr(rrArgs->node),
            (long long)( (sendRes <= 0) ? sendRes : respRes),
            NetMessage_getMsgType(rrArgs->requestMsg) );

         rrArgs->logFlags |= REQUESTRESPONSEARGS_LOGFLAG_COMMERR;
      }
   }

   socket_invalidate:
   {
      NodeConnPool_invalidateStreamSocket(connPool, sock);
   }

   // clean up
   cleanup_no_socket:

   if(rrArgs->outRespMsg)
   {
      NETMESSAGE_FREE(rrArgs->outRespMsg);
      rrArgs->outRespMsg = NULL;
   }

   if(rrArgs->outRespBuf)
   {
      // hand the buffer back the same way it was obtained
      if(rrArgs->respBufType == MessagingTkBufType_BufStore)
      {
         NoAllocBufferStore* bufStore = App_getMsgBufStore(app);
         NoAllocBufferStore_addBuf(bufStore, rrArgs->outRespBuf);
      }
      else
         kfree(rrArgs->outRespBuf);

      rrArgs->outRespBuf = NULL;
   }

   if(rrArgs->rrFlags & REQUESTRESPONSEARGS_FLAG_USEDELAYEDINTERRUPT)
      SignalTk_restoreSignals(&oldSignalSet); // U N B L O C K _ S I G s

   return retVal;
}

使用消息工厂处理响应

  • 根据收到的数据生成返回消息对象:
// fhgfs_client_module\source\net\message\NetMessageFactory.c

/**
 * Builds a message object out of a serialized message buffer (the standard way to do so).
 *
 * The header is decoded first; its message type selects the object to construct. The remaining
 * deserialization is only done for known message types.
 *
 * @param recvBuf buffer holding the serialized message
 * @param bufLen number of valid bytes in recvBuf
 * @return NetMessage which must be deleted by the caller
 * (msg->msgType is NETMSGTYPE_Invalid on error)
 */
NetMessage* NetMessageFactory_createFromBuf(App* app, char* recvBuf, size_t bufLen)
{
   NetMessageHeader header;
   NetMessage* msg;
   DeserializeCtx ctx = {
      .data = recvBuf,
      .length = bufLen,
   };

   // header first: it tells us which kind of message object is needed
   __NetMessage_deserializeHeader(&ctx, &header);
   msg = NetMessageFactory_createFromMsgType(header.msgType);

   if(unlikely(NetMessage_getMsgType(msg) == NETMSGTYPE_Invalid) )
   { // unknown type => hand the "invalid" placeholder to the caller without further parsing
      printk_fhgfs_debug(KERN_NOTICE,
         "Received an invalid or unhandled message. "
         "Message type (from raw header): %hu", header.msgType);
   }
   else
   { // known type => deserialize the rest of the buffer into the new object
      __NetMessageFactory_deserializeRaw(app, &ctx, &header, msg);
   }

   return msg;
}

/**
 * Constructs an empty message object for the given message type.
 *
 * Known types are constructed through NETMESSAGE_CONSTRUCT; NETMSGTYPE_AckNotifyResp is
 * represented by a SimpleMsg carrying that type; every other type yields a SimpleMsg with type
 * NETMSGTYPE_Invalid.
 *
 * @param msgType numeric message type (as found in a message header)
 * @return NetMessage that must be deleted by the caller
 * (msg->msgType is NETMSGTYPE_Invalid on error)
 */
NetMessage* NetMessageFactory_createFromMsgType(unsigned short msgType)
{
// maps NETMSGTYPE_<ID> to a freshly constructed <TYPE>; local to this function (see #undef below)
#define HANDLE(ID, TYPE) case NETMSGTYPE_##ID: return NETMESSAGE_CONSTRUCT(TYPE)

   switch(msgType)
   {
      // control messages
      HANDLE(Ack, AckMsgEx);
      HANDLE(GenericResponse, GenericResponseMsg);
      // helperd messages
      HANDLE(GetHostByNameResp, GetHostByNameRespMsg);
      HANDLE(LogResp, LogRespMsg);
      // nodes messages
      HANDLE(GetMirrorBuddyGroupsResp, GetMirrorBuddyGroupsRespMsg);
      HANDLE(GetNodesResp, GetNodesRespMsg);
      HANDLE(GetStatesAndBuddyGroupsResp, GetStatesAndBuddyGroupsRespMsg);
      HANDLE(GetTargetMappingsResp, GetTargetMappingsRespMsg);
      HANDLE(GetTargetStatesResp, GetTargetStatesRespMsg);
      HANDLE(HeartbeatRequest, HeartbeatRequestMsgEx);
      HANDLE(Heartbeat, HeartbeatMsgEx);
      HANDLE(MapTargets, MapTargetsMsgEx);
      HANDLE(RefreshTargetStates, RefreshTargetStatesMsgEx);
      HANDLE(RegisterNodeResp, RegisterNodeRespMsg);
      HANDLE(RemoveNode, RemoveNodeMsgEx);
      HANDLE(RemoveNodeResp, RemoveNodeRespMsg);
      HANDLE(SetMirrorBuddyGroup, SetMirrorBuddyGroupMsgEx);
      // storage messages
      HANDLE(LookupIntentResp, LookupIntentRespMsg);
      HANDLE(MkDirResp, MkDirRespMsg);
      HANDLE(RmDirResp, RmDirRespMsg);
      HANDLE(MkFileResp, MkFileRespMsg);
      HANDLE(RefreshEntryInfoResp, RefreshEntryInfoRespMsg);
      HANDLE(RenameResp, RenameRespMsg);
      HANDLE(HardlinkResp, HardlinkRespMsg);
      HANDLE(UnlinkFileResp, UnlinkFileRespMsg);
      HANDLE(MkLocalFileResp, MkLocalFileRespMsg);
      HANDLE(UnlinkLocalFileResp, UnlinkLocalFileRespMsg);
      HANDLE(ListDirFromOffsetResp, ListDirFromOffsetRespMsg);
      HANDLE(SetAttrResp, SetAttrRespMsg);
      HANDLE(StatResp, StatRespMsg);
      HANDLE(StatStoragePathResp, StatStoragePathRespMsg);
      HANDLE(TruncFileResp, TruncFileRespMsg);
      HANDLE(ListXAttrResp, ListXAttrRespMsg);
      HANDLE(GetXAttrResp, GetXAttrRespMsg);
      HANDLE(RemoveXAttrResp, RemoveXAttrRespMsg);
      HANDLE(SetXAttrResp, SetXAttrRespMsg);
      // session messages
      HANDLE(OpenFileResp, OpenFileRespMsg);
      HANDLE(CloseFileResp, CloseFileRespMsg);
      HANDLE(WriteLocalFileResp, WriteLocalFileRespMsg);
      HANDLE(FSyncLocalFileResp, FSyncLocalFileRespMsg);
      HANDLE(FLockAppendResp, FLockAppendRespMsg);
      HANDLE(FLockEntryResp, FLockEntryRespMsg);
      HANDLE(FLockRangeResp, FLockRangeRespMsg);
      HANDLE(LockGranted, LockGrantedMsgEx);
      HANDLE(BumpFileVersionResp, BumpFileVersionRespMsg);
      HANDLE(GetFileVersionResp, GetFileVersionRespMsg);

      // no dedicated message struct: a SimpleMsg that just carries the type
      case NETMSGTYPE_AckNotifyResp: {
         SimpleMsg* msg = os_kmalloc(sizeof(*msg));
         SimpleMsg_init(msg, msgType);
         return &msg->netMessage;
      }

      // unknown/unhandled type: placeholder message marked as invalid
      default:
      {
         SimpleMsg* msg = os_kmalloc(sizeof(*msg));
         SimpleMsg_init(msg, NETMSGTYPE_Invalid);
         return &msg->netMessage;
      }
   }

#undef HANDLE
}

  • 创建目录响应消息的定义:
// fhgfs_client_module\source\common\net\message\storage\creating\MkDirRespMsg.h

typedef struct MkDirRespMsg MkDirRespMsg;

/**
 * Response message for a directory creation request: carries a result code and the EntryInfo
 * that is filled in during deserialization.
 */
struct MkDirRespMsg
{
   NetMessage netMessage;

   int result; // presumably an FhgfsOpsErr value - TODO confirm against the sender

   // for deserialization
   EntryInfo entryInfo;
};

extern const struct NetMessageOps MkDirRespMsg_Ops;

// virtual functions
extern bool MkDirRespMsg_deserializePayload(NetMessage* this, DeserializeCtx* ctx);

// inlined: constructor
static inline void MkDirRespMsg_init(MkDirRespMsg* this);

// inlined: getters & setters
static inline int MkDirRespMsg_getResult(MkDirRespMsg* this);
static inline const EntryInfo* MkDirRespMsg_getEntryInfo(MkDirRespMsg* this);


/**
 * Initializes the embedded NetMessage with type NETMSGTYPE_MkDirResp and the MkDirRespMsg ops
 * table. The payload fields (result, entryInfo) are not touched here.
 */
void MkDirRespMsg_init(MkDirRespMsg* this)
{
   NetMessage* base = &this->netMessage;

   NetMessage_init(base, NETMSGTYPE_MkDirResp, &MkDirRespMsg_Ops);
}

/**
 * @return the result code stored in the message
 */
int MkDirRespMsg_getResult(MkDirRespMsg* this)
{
   const int opResult = this->result;

   return opResult;
}

/**
 * @return pointer to the EntryInfo embedded in the message (owned by the message, not a copy)
 */
const EntryInfo* MkDirRespMsg_getEntryInfo(MkDirRespMsg* this)
{
   const EntryInfo* embeddedInfo = &this->entryInfo;

   return embeddedInfo;
}

元数据服务

目录创建类(MkDirMsgEx)

  • 元数据服务中,创建目录的消息处理类:
// fhgfs_meta\source\net\message\storage\creating\MkDirMsgEx.h

/**
 * Metadata-server side handler for MkDir requests. Built on MirroredMessage, with a lock state
 * consisting of a HashDirLock, a DirIDLock and a ParentNameLock.
 */
class MkDirMsgEx : public MirroredMessage<MkDirMsg, std::tuple<HashDirLock, DirIDLock, ParentNameLock>>
{
   public:
      using ResponseState = ErrorAndEntryResponseState<MkDirRespMsg, NETMSGTYPE_MkDir>;

      bool processIncoming(ResponseContext& ctx) override;

      // acquires the locks that make up this message's lock state
      std::tuple<HashDirLock, DirIDLock, ParentNameLock> lock(EntryLockStore& store) override;

      std::unique_ptr<MirroredMessageResponseState> executeLocally(ResponseContext& ctx,
         bool isSecondary) override;

      // mirroring is decided by the parent directory's buddy-mirror flag
      bool isMirrored() override
      {
         return getParentInfo()->getIsBuddyMirrored();
      }

   private:
      std::string entryID;

      std::unique_ptr<ResponseState> mkDirPrimary(ResponseContext& ctx);
      std::unique_ptr<ResponseState> mkDirSecondary();

      FhgfsOpsErr mkDirDentry(DirInode& parentDir, const std::string& name,
         const EntryInfo* entryInfo, const bool isBuddyMirrored);

      FhgfsOpsErr mkRemoteDirInode(DirInode& parentDir, const std::string& name,
         EntryInfo* entryInfo, const CharVector& defaultACLXAttr, const CharVector& accessACLXAttr);
      FhgfsOpsErr mkRemoteDirCompensate(EntryInfo* entryInfo);

      bool forwardToSecondary(ResponseContext& ctx) override;

      // the secondary answers with a MkDirRespMsg; its result value is the outcome we report
      FhgfsOpsErr processSecondaryResponse(NetMessage& resp) override
      {
         auto& mkDirResp = static_cast<MkDirRespMsg&>(resp);

         return static_cast<FhgfsOpsErr>(mkDirResp.getResult());
      }

      const char* mirrorLogContext() const override
      {
         return "MkDirMsgEx/forward";
      }

      EntryInfo newEntryInfo;
};

消息基类处理(MirroredMessage)

  • 基类(MirroredMessage)负责实现BuddyMirror同步等所有消息处理类共有的功能:
// fhgfs_meta\source\net\message\MirroredMessage.h

template<typename BaseT, typename LockStateT>
class MirroredMessage : public BaseT
{
   template<typename T1, typename T2>
   friend class MirroredMessage;

      /**
       * Common entry point for all mirrored messages.
       *
       * Steps, in order:
       *  - on the primary of a mirrored operation: pick up a resync job (if a resync is in
       *    progress) and take the entry locks via lock();
       *  - for mirrored operations: reference the requestor's mirrored session;
       *  - for mirrored operations with a sequence number: either answer with a new seqNoBase
       *    (sequence number 0) or acquire the mirror state slot for that sequence number;
       *  - if the slot already existed: replay the stored response or ask the client to try
       *    again; otherwise run executeLocally() and finish the operation.
       *
       * @return always true
       */
      virtual bool processIncoming(NetMessage::ResponseContext& ctx)
      {
         Session* session = nullptr;
         bool isNewState = true;

         if (isMirrored() && !this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond))
         {
            if (Program::getApp()->getInternodeSyncer()->getResyncInProgress())
               resyncJob = Program::getApp()->getBuddyResyncer()->getResyncJob();

            lockState = lock(*Program::getApp()->getMirroredSessions()->getEntryLockStore());
         }

         // make sure that the thread change set is *always* cleared when we leave this method.
         // (scope guard: its destructor runs on every exit path, including "goto exit")
         struct ChangeSetGuard {
            ~ChangeSetGuard()
            {
               if (BuddyResyncer::getSyncChangeset())
               {
                  LOG(WARNING, "Abandoning sync changeset");
                  BuddyResyncer::abandonSyncChangeset();
               }
            }
         } changeSetGuard;
         (void) changeSetGuard;

         mirrorState.reset();
         if (isMirrored())
         {
            const auto nodeID = this->getRequestorID(ctx).second;
            session = Program::getApp()->getMirroredSessions()->referenceSession(nodeID, true);
         }

         if (isMirrored() && this->hasFlag(NetMessageHeader::Flag_HasSequenceNumber))
         {
            // special case: client has not been told where to start its sequence. in this case,
            // we want to answer with only the new seqNoBase for the client, and do NO processing.
            if (this->getSequenceNumber() == 0)
            {
               GenericResponseMsg response(GenericRespMsgCode_NEWSEQNOBASE, "New seqNoBase");

               response.addFlag(NetMessageHeader::Flag_HasSequenceNumber);
               response.setSequenceNumber(session->getSeqNoBase());
               ctx.sendResponse(response);
               goto exit;
            }

            // a note on locking of mirrorState. since clients process each request in only one
            // thread, per client we can have only one request for a given sequence number at any
            // given time. retries may reuse the same sequence number, and they may be processed in
            // a different thread on the server, but no two threads process the same sequence number
            // from the same client at the same time. thus, no locking for the actual structure is
            // needed, but extra memory barriers to ensure propagation of results between threads
            // are necessary.
            __sync_synchronize();
            if (this->hasFlag(NetMessageHeader::Flag_IsSelectiveAck))
               std::tie(mirrorState, isNewState) = session->acquireMirrorStateSlotSelective(
                     this->getSequenceNumberDone(),
                     this->getSequenceNumber());
            else
               std::tie(mirrorState, isNewState) = session->acquireMirrorStateSlot(
                     this->getSequenceNumberDone(),
                     this->getSequenceNumber());
         }

         if (!isNewState)
         {
            // this sequence number has been seen before: replay the stored response if the
            // earlier request has finished, otherwise tell the client to come back later
            if (mirrorState->response)
               mirrorState->response->sendResponse(ctx);
            else
               ctx.sendResponse(
                     GenericResponseMsg(
                        GenericRespMsgCode_TRYAGAIN,
                        "Request for same sequence number is currently in progress"));
         }
         else
         {
            if (resyncJob && resyncJob->isRunning())
               BuddyResyncer::registerSyncChangeset();

            auto responseState = executeLocally(ctx,
               isMirrored() && this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond));

            // responseState may be null if the message has called earlyComplete(). do not finish
            // the operation twice in this case.
            if (responseState)
               finishOperation(ctx, std::move(responseState));
         }

      exit:
         if (session)
            Program::getApp()->getMirroredSessions()->releaseSession(session);

         return true;
      }

      /**
       * Completes an operation after local execution: informs the secondary (primary side of a
       * mirrored operation only), stores the response state in the mirror state slot (if any),
       * commits or abandons the resync changeset, sends the response to the requestor and
       * finally resets the lock state.
       *
       * @param state response state produced by local execution; ownership moves into
       *    mirrorState if a mirror state slot exists.
       */
      void finishOperation(NetMessage::ResponseContext& ctx,
         std::unique_ptr<MirroredMessageResponseState> state)
      {
         // raw handle, because `state` itself may be moved away further down
         MirroredMessageResponseState* const result = state.get();
         bool secondaryReached = true;

         if (isMirrored() &&
               !this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond) &&
               state)
         {
            // state-changing operations are forwarded in full, others only need an ACK notice
            secondaryReached = state->changesObservableState()
               ? forwardToSecondary(ctx)
               : notifySecondaryOfACK(ctx);
         }

         if (mirrorState)
            mirrorState->response = std::move(state);

         // pairs with the memory barrier before acquireMirrorStateSlot
         __sync_synchronize();

         if (BuddyResyncer::getSyncChangeset())
         {
            const bool commitChangeset = isMirrored() &&
               !this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond) &&
               result &&
               result->changesObservableState();

            if (commitChangeset)
               BuddyResyncer::commitThreadChangeSet();
            else
               BuddyResyncer::abandonSyncChangeset();
         }

         if (!secondaryReached)
         {
            ctx.sendResponse(
                  GenericResponseMsg(
                     GenericRespMsgCode_INDIRECTCOMMERR_NOTAGAIN,
                     "Communication with secondary failed"));
         }
         else if (result)
         {
            result->sendResponse(ctx);
         }

         lockState = {};
      }

      /**
       * Sends an AckNotifiyMsg to the secondary and expects a NETMSGTYPE_AckNotifyResp.
       *
       * @return true if sendToSecondary() reported FhgfsOpsErr_SUCCESS
       */
      bool notifySecondaryOfACK(NetMessage::ResponseContext& ctx)
      {
         // if the secondary does not respond with SUCCESS, it will automatically be set to
         // needs-resync. eventually, resync will clear the secondary sessions entirely, which will
         // also flush the sequence number store.
         AckNotifiyMsg ackNotification;
         const FhgfsOpsErr notifyRes = sendToSecondary(ctx, ackNotification,
            NETMSGTYPE_AckNotifyResp);

         return notifyRes == FhgfsOpsErr_SUCCESS;
      }

      /**
       * Forward a mirrored message to the secondary of the local meta buddy group and compare
       * the secondary's result with the expected one.
       *
       * Errors are not reported through the return value: every exit path returns
       * FhgfsOpsErr_SUCCESS. Communication failures and mismatching results are handled by
       * calling setBuddyNeedsResync() instead.
       *
       * @param ctx response context of the request being processed; used to determine the
       *    requestor ID that is passed on to the secondary
       * @param message message to send; sequence numbers, requestor ID, user ID and flags are
       *    set on it before sending (Flag_BuddyMirrorSecond is removed again afterwards)
       * @param respType expected type of the response message
       * @param expectedResult result the secondary is expected to return
       * @return always FhgfsOpsErr_SUCCESS
       */
      template<typename T>
      FhgfsOpsErr sendToSecondary(NetMessage::ResponseContext& ctx, MirroredMessageBase<T>& message,
         unsigned respType, FhgfsOpsErr expectedResult = FhgfsOpsErr_SUCCESS)
      {
         App* app = Program::getApp();
         NodeStoreServers* metaNodes = app->getMetaNodes();
         MirrorBuddyGroupMapper* buddyGroups = app->getMetaBuddyGroupMapper();

         // debugging aid: optional artificial delay (in seconds) before forwarding
         DEBUG_ENV_VAR(unsigned, FORWARD_DELAY, 0, "BEEGFS_FORWARD_DELAY_SECS");

         if (FORWARD_DELAY)
            sleep(FORWARD_DELAY);

         // if a resync is currently running, abort right here, immediately. we do not need to know
         // the exact state of the buddy: a resync is running. it's bad.
         // (nothing is forwarded in this case, but SUCCESS is still returned to the caller)
         if (app->getInternodeSyncer()->getResyncInProgress())
            return FhgfsOpsErr_SUCCESS;

         // check whether the secondary is viable at all: if it is not online and good,
         // communicating will not do any good. even online/needs-resync must be skipped, because
         // the resyncer must be the only entity that changes the secondary as long as it is not
         // good yet.
         {
            CombinedTargetState secondaryState;
            NumNodeID secondaryID(buddyGroups->getSecondaryTargetID(
                  buddyGroups->getLocalGroupID()));

            bool getStateRes = app->getMetaStateStore()->getState(secondaryID.val(),
                  secondaryState);

            // if the secondary is anything except online/good, set it to needs-resync immediately.
            // whenever we pass this point, the secondary will have missed *something* of
            // importance, so anything except online/good must be set to needs-resync right here.
            if (!getStateRes
                  || secondaryState.reachabilityState != TargetReachabilityState_ONLINE
                  || secondaryState.consistencyState != TargetConsistencyState_GOOD)
            {
               auto* const resyncer = app->getBuddyResyncer();
               auto* const job = resyncer->getResyncJob();

               // if we have no job or a running job, we must start a resync soon. if we have a
               // job that has finished successfully, the management server may not have noticed
               // that the secondary is completely resynced, so our buddy's state may well not be
               // GOOD even though we have resynced completely. we may assume that a successful
               // resync implies that the buddy is good, even if the management server thinks it
               // isn't.
               if (!job ||
                     (!job->isRunning() && job->getState() != BuddyResyncJobState_SUCCESS))
               {
                  setBuddyNeedsResync();
                  return FhgfsOpsErr_SUCCESS;
               }
            }
         }

         // the request is addressed to the local buddy group ID, with mirror info enabled
         RequestResponseArgs rrArgs(NULL, &message, respType);
         RequestResponseNode rrNode(NumNodeID(buddyGroups->getLocalGroupID()), metaNodes);

         rrNode.setMirrorInfo(buddyGroups, true);
         rrNode.setTargetStates(app->getMetaStateStore());

         prepareMirrorRequestArgs(rrArgs);

         // copy sequence numbers and set original requestor info for secondary
         message.setSequenceNumber(this->getSequenceNumber());
         message.setSequenceNumberDone(this->getSequenceNumberDone());
         message.setRequestorID(this->getRequestorID(ctx));
         // (almost) all messages do some sort of statistics gathering by user ID
         message.setMsgHeaderUserID(this->getMsgHeaderUserID());
         // set flag here instead of at the beginning because &message == this is often used
         message.addFlag(NetMessageHeader::Flag_BuddyMirrorSecond);
         message.addFlag(this->getFlags() & NetMessageHeader::Flag_IsSelectiveAck);
         message.addFlag(this->getFlags() & NetMessageHeader::Flag_HasSequenceNumber);

         FhgfsOpsErr commRes = MessagingTk::requestResponseNode(&rrNode, &rrArgs);

         // undo the "secondary" marker again (message may be the very object we are running on)
         message.removeFlag(NetMessageHeader::Flag_BuddyMirrorSecond);

         if (commRes != FhgfsOpsErr_SUCCESS)
         {
            // since we have reached this point, the secondary has indubitably not received
            // important information from the primary. we now have two choices to keep the system
            // in a consistent, safe state:
            //
            //  1) set the secondary to needs-resync
            //  2) rollback the modifications we have made and let the client retry, hoping that
            //     some future communication with the secondary is successful
            //
            // 2 is not a viable option: since some operations may move data off of this metadata
            // server and onto another one completely; allowing these to be undone requires a
            // two-phase commit protocol, which incurs large communication overhead for a
            // (hopefully) very rare error case. other operations delete local state (eg unlink,
            // or close of an unlinked file), which would have to be held in limbo until either a
            // commit or a rollback is issued.
            //
            // since we assume that communication errors are very rare, option 1 is the most
            // efficient in the general case (as it does not have to keep objects alive past their
            // intended lifetimes), so we set the secondary to needs-resync on any kind of
            // communication error.
            // other errors, e.g. out-of-memory conditions or errors caused by streamout hooks, are
            // also assumed to be rare. if any of these happens, the secondary must be resynced no
            // matter what actually happened. since the operations itself succeeded, we cannot send
            // a notification about the communication error either - we'd have to drop the operation
            // result to do that.

#ifdef BEEGFS_DEBUG
            // TODO use cached Buddy ID from new BuddyState class
            int buddyNodeID = buddyGroups->getBuddyTargetID(app->getLocalNodeNumID().val());

            LOG_CTX(DEBUG, mirrorLogContext(), "Communication with secondary failed. "
                  "Resync will be required when secondary comes back", buddyNodeID, commRes);
#endif
            setBuddyNeedsResync();

            return FhgfsOpsErr_SUCCESS;
         }

         // communication worked => let the concrete message type extract the secondary's result
         FhgfsOpsErr respMsgRes = processSecondaryResponse(*rrArgs.outRespMsg);

         if (respMsgRes != expectedResult)
         {
            // whoops; primary and secondary did different things; if secondary is not resyncing
            // AND communication was good this is concerning (result must have been success on
            // primary, otherwise no forwarding would have happened).
            // usually, this would mean that primary and secondary do not have the same state, or
            // that the secondary has some kind of system error. (if the primary had a system error,
            // it would be more likely to fail than to succeed).
            // in either case, the secondary should be resynced, even if the primary experienced
            // a hardware fault or similar errors: at this point, we can no longer differentiate
            // between good and bad state on the primary, and the secondary may be arbitrarily out
            // of sync.
            LOG_CTX(NOTICE, mirrorLogContext(),
                  "Different return codes from primary and secondary buddy. "
                  "Setting secondary to needs-resync.",
                  as("Expected response", FhgfsOpsErrTk::toErrString(expectedResult)),
                  as("Received response", FhgfsOpsErrTk::toErrString(respMsgRes)));
            setBuddyNeedsResync();
         }

         return FhgfsOpsErr_SUCCESS;
      }
}

根目录DirEntry获取

  • 元数据服务在初始化时,就会在本地初始化根目录的DirEntry,由于根目录的EntryID就是固定的“root”,所以只需要去固定的路径读取就行了:
// fhgfs_common\source\common\storage\Metadata.h

#define META_ROOTDIR_ID_STR            "root" /* initial file system entry point */


// fhgfs_meta\source\app\App.cpp

/**
 * Create the MetaStore and load the root directory from it; if no root directory can be
 * referenced, a new (non-mirrored) one is created and stored.
 *
 * @param localNodeNumID numeric ID of the local node; only used to decide which log message is
 *    printed for a loaded root directory
 * @throw InvalidConfigException if a newly created root directory cannot be referenced
 */
void App::initRootDir(NumNodeID localNodeNumID)
{
   this->metaStore = new MetaStore();

   // first attempt: buddy-mirrored root dir; second attempt: non-mirrored root dir
   this->rootDir = this->metaStore->referenceDir(META_ROOTDIR_ID_STR, true, true);
   if (!this->rootDir)
      this->rootDir = this->metaStore->referenceDir(META_ROOTDIR_ID_STR, false, true);

   if (!this->rootDir)
   { // nothing could be loaded => create a new rootDir (not mirrored)
      this->log->log(Log_CRITICAL,
         "This appears to be a new storage directory. Creating a new root dir.");

      UInt16Vector emptyTargets;
      unsigned chunkSize = this->cfg->getTuneDefaultChunkSize();
      unsigned numStripeTargets = this->cfg->getTuneDefaultNumStripeTargets();
      Raid0Pattern rootPattern(chunkSize, emptyTargets, numStripeTargets);

      DirInode freshRootDir(META_ROOTDIR_ID_STR,
         S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO, 0, 0, NumNodeID(), rootPattern, false);

      this->metaStore->makeDirInode(freshRootDir);
      this->rootDir = this->metaStore->referenceDir(META_ROOTDIR_ID_STR, false, true);

      if (!this->rootDir)
      { // error
         this->log->logErr("Failed to store root directory. Unable to proceed.");
         throw InvalidConfigException("Failed to store root directory");
      }

      return;
   }

   // loading succeeded (with or without mirroring) => init the root node ID from the dir owner
   this->log->log(Log_NOTICE, "Root directory loaded.");

   NumNodeID rootOwner = this->rootDir->getOwnerNodeID();
   bool rootMirrored = this->rootDir->getIsBuddyMirrored();

   // try to set the owner as root node; nothing more to do if it is unset or not accepted
   const bool ownerAccepted = (rootOwner != 0)
      && metaNodes->setRootNodeNumID(rootOwner, false, rootMirrored);
   if (!ownerAccepted)
      return;

   // for a mirrored root dir the owner is a buddy group ID => resolve its primary
   const NumNodeID primaryRootOwner = rootMirrored
      ? NumNodeID(metaBuddyGroupMapper->getPrimaryTargetID(rootOwner.val() ) )
      : rootOwner;

   if (localNodeNumID != primaryRootOwner)
   {
      this->log->log(Log_CRITICAL,
         "Root metadata server (by possession of root directory): " + rootOwner.str());
      return;
   }

   this->log->log(Log_CRITICAL, "I got root (by possession of root directory)");
   if (rootMirrored)
      this->log->log(Log_CRITICAL, "Root directory is mirrored");
}

创建目录消息处理

  • 重载基类的消息处理虚函数:
// fhgfs_meta\source\net\message\storage\creating\MkDirMsgEx.cpp

/**
 * Entry point for an incoming MkDirMsg: generates the entry ID of the new directory, hands
 * the message to the base class implementation and updates the per-node operation counters.
 *
 * @return always true
 */
bool MkDirMsgEx::processIncoming(ResponseContext& ctx)
{
   #ifdef BEEGFS_DEBUG
      const char* debugContext = "MkDirMsg incoming";

      LOG_DEBUG(debugContext, Log_DEBUG, "Received a MkDirMsg from: " + ctx.peerName() );
   #endif // BEEGFS_DEBUG

   App* const application = Program::getApp();

   // the entry ID is generated before the base class processing runs
   this->entryID = StorageTk::generateFileID(application->getLocalNode().getNumID());

   BaseType::processIncoming(ctx);

   // account for this request in the operation statistics
   application->getNodeOpStats()->updateNodeOp(
      ctx.getSocket()->getPeerIP(), MetaOpCounter_MKDIR, getMsgHeaderUserID() );

   return true;
}
生成目录文件ID
  • 目录的ID名称由3个部分组成:创建序号、时间戳和NodeID:
// fhgfs_common\source\common\toolkit\StorageTk.h
class StorageTk
{
      /**
       * Build the ID string for a new fs entry (file or dir).
       *
       * The result has the form "<counter>-<timestamp>-<nodeID>", each part in hex.
       *
       * @param localNodeID numeric ID of the node that generates the entry ID
       */
      static std::string generateFileID(const NumNodeID localNodeID)
      {
         /* assumptions: the clock does not jump backwards between restarts of the daemon, there
          is at least one second between restarts, and no more than 2^32 IDs per second
          (sustained) are needed */

         // idCounter value layout: high 32bits are timestamp, low 32bits are sequential counter
         const uint64_t rawID = idCounter.increase();

         const uint32_t timestampPart = (uint32_t) (rawID >> STORAGETK_FILEID_TIMESTAMP_SHIFTBITS);
         const uint32_t counterPart = (uint32_t) rawID;

         /* the counter deliberately comes first in the string: with the timestamp first, the
          leading characters of entryIDs/filenames would always be similar, which is bad for
          strcmp() and similar operations the underlying fs might need to do */
         std::string fileID = StringTk::uintToHexStr(counterPart);
         fileID += "-";
         fileID += StringTk::uintToHexStr(timestampPart);
         fileID += "-";
         fileID += localNodeID.strHex();

         return fileID;
      }
}
开始创建目录文件
  • 正式开始创建目录元数据文件,会根据BuddyMirror的主、备进行区分:
// fhgfs_meta\source\net\message\storage\creating\MkDirMsgEx.cpp

/**
 * Run the mkdir operation on this node, either as secondary or as primary.
 *
 * @param isSecondary true to run mkDirSecondary(), false to run mkDirPrimary()
 * @return the response state produced by the selected variant (may be empty)
 */
std::unique_ptr<MirroredMessageResponseState> MkDirMsgEx::executeLocally(ResponseContext& ctx,
   bool isSecondary)
{
   std::unique_ptr<MkDirMsgEx::ResponseState> response;

   if (isSecondary)
      response = mkDirSecondary();
   else
      response = mkDirPrimary(ctx);

   const bool failed = response && response->getResult() != FhgfsOpsErr_SUCCESS;
   if (failed)
      LOG_DBG(DEBUG, "Failed to create directory",
            as("parentID", getParentInfo()->getEntryID()),
            as("newDirName", getNewDirName()),
            as("error", FhgfsOpsErrTk::toErrString(response->getResult())));

   // explicit move: the local has a different unique_ptr type than the declared return type
   return std::move(response);
}

创建主节点目录文件
  • 如果是主节点,则先引用父目录并校验其归属,然后为新目录选择属主元数据节点、处理ACL,接着创建远端目录的DirInode,最后在父目录中创建对应的DirEntry:
// fhgfs_meta\source\net\message\storage\creating\MkDirMsgEx.cpp

/**
 * Create a new directory as primary (or on a non-mirrored node).
 *
 * Steps: reference the parent dir and verify that this node (or its buddy group) owns it,
 * choose the owner node for the new directory, derive mode/ACLs from the parent's default ACL
 * (if client ACLs are stored), create the dir inode on the chosen owner and finally create the
 * dentry in the parent. If the dentry cannot be created, the remote inode creation is
 * compensated.
 *
 * @param ctx response context; used to send the response early on success
 * @return an empty pointer if the operation succeeded and the response was already handed to
 *    earlyComplete(); otherwise a ResponseState carrying the error code
 */
std::unique_ptr<MkDirMsgEx::ResponseState> MkDirMsgEx::mkDirPrimary(ResponseContext& ctx)
{
   const char* logContext = "MkDirMsg (mkDirPrimary)";

   App* app =  Program::getApp();
   ModificationEventFlusher* modEventFlusher = app->getModificationEventFlusher();
   const bool modEventLoggingEnabled = modEventFlusher->isLoggingEnabled();
   MetaStore* metaStore = app->getMetaStore();
   Config* config = app->getConfig();
   MirrorBuddyGroupMapper* metaBuddyGroupMapper = app->getMetaBuddyGroupMapper();
   NodeCapacityPools* metaCapacityPools;
   NumNodeID expectedOwnerID;

   const EntryInfo* const parentInfo = getParentInfo();
   const std::string& newName = getNewDirName();

   FhgfsOpsErr retVal;

   // not a good idea to use scoped locks here, because we don't have a well-defined scope with
   // only buddy mirrored paths here; directly use entrylockstore
   EntryLockStore* entryLockStore = Program::getApp()->getSessions()->getEntryLockStore();

   // reference parent
   DirInode* parentDir = metaStore->referenceDir(parentInfo->getEntryID(),
      parentInfo->getIsBuddyMirrored(), true);
   if(!parentDir)
      return boost::make_unique<ResponseState>(FhgfsOpsErr_PATHNOTEXISTS, EntryInfo());

   // the new dir is mirrored only if the parent is mirrored and the client did not opt out
   NumNodeID localNodeID = app->getLocalNodeNumID();
   bool isBuddyMirrored = parentDir->getIsBuddyMirrored()
      && !isMsgHeaderFeatureFlagSet(MKDIRMSG_FLAG_NOMIRROR);

   // check whether localNode owns this (parent) directory; if parentDir is buddy mirrored compare
   // ownership to buddy group id, otherwise to node id itself
   if (parentDir->getIsBuddyMirrored())
      expectedOwnerID = NumNodeID(metaBuddyGroupMapper->getLocalGroupID() );
   else
      expectedOwnerID = localNodeID;

   if (isBuddyMirrored)
      metaCapacityPools = app->getMetaBuddyCapacityPools();
   else
      metaCapacityPools = app->getMetaCapacityPools();

   if(parentDir->getOwnerNodeID() != expectedOwnerID)
   { // this node doesn't own the parent dir
      LogContext(logContext).logErr(std::string("Dir-owner mismatch: \"") +
         parentDir->getOwnerNodeID().str()  + "\" vs. \"" +
         expectedOwnerID.str() + "\"");
      metaStore->releaseDir(parentInfo->getEntryID() );
      return boost::make_unique<ResponseState>(FhgfsOpsErr_NOTOWNER, EntryInfo());
   }

   // choose new directory owner...
   unsigned numDesiredTargets = 1;
   unsigned minNumRequiredTargets = numDesiredTargets;
   UInt16Vector newOwnerNodes;

   metaCapacityPools->chooseStorageTargets(numDesiredTargets, minNumRequiredTargets,
      &getPreferredNodes(), &newOwnerNodes);

   if(unlikely(newOwnerNodes.size() < minNumRequiredTargets) )
   { // (might be caused by a bad list of preferred targets)
      LogContext(logContext).logErr("No metadata servers available for new directory: " + newName);

      metaStore->releaseDir(parentInfo->getEntryID() );
      // we know that *some* metadata server must exist, since we are obviously active when we get
      // here. most likely a client has received a pool update before we have, or we have been
      // switched from secondary to primary and haven't been set to Good yet.
      // if preferred nodes have been set (currently only done by ctl), those may also be registered
      // as unavailable at the current time.
      // have the client retry until things work out.
      return boost::make_unique<ResponseState>(FhgfsOpsErr_COMMUNICATION, EntryInfo());
   }

   const uint16_t ownerNodeID = newOwnerNodes[0];
   const std::string parentEntryID = parentInfo->getEntryID();
   int entryInfoFlags = isBuddyMirrored ? ENTRYINFO_FEATURE_BUDDYMIRRORED : 0;

   int mode = getMode();
   const int umask = getUmask();
   CharVector parentDefaultACLXAttr;
   CharVector accessACLXAttr;

   if (config->getStoreClientACLs())
   {
      // Determine the ACLs of the new directory.
      PosixACL parentDefaultACL;
      bool needsACL;
      FhgfsOpsErr parentDefaultACLRes;

      std::tie(parentDefaultACLRes, parentDefaultACLXAttr, std::ignore) = parentDir->getXAttr(
         nullptr, PosixACL::defaultACLXAttrName, XATTR_SIZE_MAX);

      if (parentDefaultACLRes == FhgfsOpsErr_SUCCESS)
      {
         // parent has a default ACL
         if (!parentDefaultACL.deserializeXAttr(parentDefaultACLXAttr))
         {
            LogContext(logContext).log(Log_ERR,
               "Error deserializing directory default ACL for directory ID " + parentDir->getID());
            retVal = FhgfsOpsErr_INTERNAL;
            goto clean_up;
         }

         if (!parentDefaultACL.empty())
         {
            // Note: This modifies the mode bits as well as the ACL itself.
            FhgfsOpsErr modeRes = parentDefaultACL.modifyModeBits(mode, needsACL);
            setMode(mode, 0);

            if (modeRes != FhgfsOpsErr_SUCCESS)
            {
               LogContext(logContext).log(Log_ERR, "Error generating access ACL for new directory "
                     + newName);
               retVal = FhgfsOpsErr_INTERNAL;
               goto clean_up;
            }

            if (needsACL)
               parentDefaultACL.serializeXAttr(accessACLXAttr);
         }
         else
         {
            // On empty ACL, clear the Xattr, so it doesn't get set on the newly created dir
            parentDefaultACLXAttr.clear();
         }
      }

      if (parentDefaultACLRes == FhgfsOpsErr_NODATA
            || (parentDefaultACLRes == FhgfsOpsErr_SUCCESS && parentDefaultACL.empty()))
      {
         // containing dir has no ACL, so we can continue without one.
         mode &= ~umask;
         setMode(mode, umask);
      }

      if (parentDefaultACLRes != FhgfsOpsErr_SUCCESS && parentDefaultACLRes != FhgfsOpsErr_NODATA)
      {
         LogContext(logContext).log(Log_ERR,
            "Error loading default ACL for directory ID " + parentDir->getID() );
         retVal = parentDefaultACLRes;
         goto clean_up;
      }
   }

   newEntryInfo.set(NumNodeID(ownerNodeID), parentEntryID, entryID, newName,
      DirEntryType_DIRECTORY, entryInfoFlags);

   // create remote dir metadata
   // (we create this before the dentry to reduce the risk of dangling dentries)
   retVal = mkRemoteDirInode(*parentDir, newName, &newEntryInfo, parentDefaultACLXAttr,
      accessACLXAttr);

   if ( likely(retVal == FhgfsOpsErr_SUCCESS) )
   { // remote dir created => create dentry in parent dir
      // note: we can't lock before this point, because mkRemoteDirInode will also send a message
      // that needs to aquire a lock on the ID
      FileIDLock lock;

      if (parentInfo->getIsBuddyMirrored())
         lock = {entryLockStore, entryID};

      retVal = mkDirDentry(*parentDir, newName, &newEntryInfo, isBuddyMirrored);

      if ( retVal != FhgfsOpsErr_SUCCESS )
      { // error (or maybe name just existed already) => compensate metaDir creation
         // note: unlock needs to happen before a possible remoteDirCompensation, because rmDir
         // will also need to lock entryID
         lock = {};
         mkRemoteDirCompensate(&newEntryInfo);
      }
      else
      {
         // on success, complete early to unlock the ParentNameLock right here. we have to do this
         // here because adding a modification event for the created directory may block for a long
         // time
         earlyComplete(ctx, ResponseState(retVal, newEntryInfo));
         metaStore->releaseDir(parentInfo->getEntryID());

         // drop the entry ID lock as well before the (potentially long-blocking) event add
         lock = {};

         // record the creation event. previously this path returned without adding it, so the
         // event was never logged for directories that were actually created.
         if (modEventLoggingEnabled)
            modEventFlusher->add(ModificationEvent_DIRCREATED, newEntryInfo.getEntryID());

         // the response has already been passed to earlyComplete() => nothing left to return
         return {};
      }
   }

clean_up:
   // failure path: the response (with the error code) is returned to the caller
   metaStore->releaseDir(parentInfo->getEntryID() );

   // NOTE(review): the event is also added for failed attempts (unchanged behavior); confirm
   // whether that is intended
   if (modEventLoggingEnabled)
      modEventFlusher->add(ModificationEvent_DIRCREATED, newEntryInfo.getEntryID());

   return boost::make_unique<ResponseState>(retVal, std::move(newEntryInfo));
}

创建备节点目录文件
// fhgfs_meta\source\net\message\storage\creating\MkDirMsgEx.cpp

std::unique_ptr<MkDirMsgEx::ResponseState> MkDirMsgEx::mkDirSecondary()
{
   MetaStore* metaStore = Program::getApp()->getMetaStore();

   // only create the dentry here; forwarding of inode creation directly happens in MkLocalFileMsg
   // and error handling and compensation is done by primary
   FhgfsOpsErr retVal;

   // reference parent
   DirInode* parentDir = metaStore->referenceDir(getParentInfo()->getEntryID(),
      getParentInfo()->getIsBuddyMirrored(), true);
   if(!parentDir)
      return boost::make_unique<ResponseState>(FhgfsOpsErr_PATHNOTEXISTS, EntryInfo());

   retVal = mkDirDentry(*parentDir, getCreatedEntryInfo()->getFileName(), getCreatedEntryInfo(),
      getCreatedEntryInfo()->getIsBuddyMirrored());

   metaStore->releaseDir(getParentInfo()->getEntryID());

   return boost::make_unique<ResponseState>(retVal, *getCreatedEntryInfo());
}
更新父目录DirInode
// fhgfs_meta\source\net\message\storage\creating\MkDirMsgEx.cpp

/**
 * Create the dentry for a new directory inside the given parent directory.
 *
 * @param parentDir directory that receives the new dentry
 * @param name name of the new dentry
 * @param entryInfo provides entry ID and owner node ID of the new directory
 * @param isBuddyMirrored true to set the buddy mirror feature flag on the dentry
 * @return result of DirInode::makeDirEntry()
 */
FhgfsOpsErr MkDirMsgEx::mkDirDentry(DirInode& parentDir, const std::string& name,
   const EntryInfo* entryInfo, const bool isBuddyMirrored)
{
   const std::string newEntryID = entryInfo->getEntryID();
   const NumNodeID newOwnerID = entryInfo->getOwnerNodeID();

   DirEntry dentry(DirEntryType_DIRECTORY, name, newEntryID, newOwnerID);
   if (isBuddyMirrored)
      dentry.setBuddyMirrorFeatureFlag();

   const FhgfsOpsErr createRes = parentDir.makeDirEntry(dentry);
   if (createRes != FhgfsOpsErr_SUCCESS)
      return createRes;

   // dentry was created => adjust the parent's timestamps if requested
   if (shouldFixTimestamps())
      fixInodeTimestamp(parentDir, parentTimestamps);

   return createRes;
}

// fhgfs_meta\source\storage\DirInode.cpp
/**
 * Create a new directory entry in this directory while holding the write lock.
 *
 * @param entry the entry to create; it is passed on with deleteEntry=false
 */
FhgfsOpsErr DirInode::makeDirEntry(DirEntry& entry)
{
   SafeRWLock writeLock(&rwlock, SafeRWLock_WRITE); // L O C K

   // the entry is handed in by reference, so it is not deleted by the unlocked variant
   const FhgfsOpsErr createRes = makeDirEntryUnlocked(&entry, false);

   writeLock.unlock(); // U N L O C K

   return createRes;
}


/**
 * Unlocked variant of makeDirEntry(): creates the entry without taking the rwlock itself.
 *
 * @param deleteEntry  shall we delete entry or does the caller still need it?
 */
FhgfsOpsErr DirInode::makeDirEntryUnlocked(DirEntry* entry, bool deleteEntry)
{
   // result for unsupported entry types (neither file nor dir)
   FhgfsOpsErr mkRes = FhgfsOpsErr_INTERNAL;

   DirEntryType entryType = entry->getEntryType();
   if (unlikely( (!DirEntryType_ISFILE(entryType) && (!DirEntryType_ISDIR(entryType) ) ) ) )
      goto out;

   // load DirInode on demand if required, we need it now
   if (loadIfNotLoadedUnlocked() == false)
   {
      mkRes = FhgfsOpsErr_PATHNOTEXISTS;
      goto out;
   }

   mkRes = this->entries.makeEntry(entry);
   if(mkRes == FhgfsOpsErr_SUCCESS)
   { // entry successfully created

      if (DirEntryType_ISDIR(entryType) )
      {
         // update our own dirInode
         increaseNumSubDirsAndStoreOnDisk();
      }
      else
      {
         // update our own dirInode
         increaseNumFilesAndStoreOnDisk();
      }

   }

   // for mirrored dirs with an active sync change set, register this dir's inode file as
   // modified (this happens independent of mkRes)
   if (getIsBuddyMirrored())
   {
      if (auto* resync = BuddyResyncer::getSyncChangeset())
      {
         const Path* inodePath = Program::getApp()->getBuddyMirrorInodesPath();
         std::string inodeFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);
         resync->addModification(inodeFilename, MetaSyncFileType::Inode);
      }
   }

out:
   // the entry is deleted on every exit path if the caller asked for it
   if (deleteEntry)
      delete entry;

   return mkRes;
}
加载父目录DirInode
/**
 * Make sure the DirInode has been loaded from disk, loading it now if that has not happened
 * yet.
 *
 * @return true if the inode was already loaded or could be loaded now, false if loading from
 *    disk failed.
 */
bool DirInode::loadIfNotLoadedUnlocked()
{
   if (this->isLoaded)
      return true;

   // the object was created without loading the inode from disk => catch up on that now
   if (loadFromFile())
      return true;

   const char* logContext = "Loading DirInode on demand";
   std::string msg = "Loading DirInode failed dir-ID: ";
   LOG_DEBUG_CONTEXT(LogContext(logContext), Log_DEBUG, msg + this->id);
   IGNORE_UNUSED_VARIABLE(logContext);

   return false;
}


/**
 * Load the inode through loadFromFileXAttr() or loadFromFileContents(), depending on the
 * "use extended attributes" config setting, and mark the inode as loaded on success.
 *
 * @return true if loading succeeded
 */
bool DirInode::loadFromFile()
{
   const bool storedInXAttr = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();

   const bool loaded = storedInXAttr ? loadFromFileXAttr() : loadFromFileContents();

   if (loaded)
      this->isLoaded = true;

   return loaded;
}


/**
 * Note: Don't call this directly, use the wrapper loadFromFile().
 */
bool DirInode::loadFromFileXAttr()
{
   const char* logContext = "Directory (load from xattr file)";

   App* app = Program::getApp();

   const Path* inodePath =
      getIsBuddyMirrored() ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
   std::string inodeFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);

   bool retVal = false;

   char buf[META_SERBUF_SIZE];

   ssize_t getRes = getxattr(inodeFilename.c_str(), META_XATTR_NAME, buf, META_SERBUF_SIZE);
   if(getRes > 0)
   { // we got something => deserialize it
      Deserializer des(buf, getRes);
      DiskMetaData::deserializeDirInode(des, *this);
      if(unlikely(!des.good()))
      { // deserialization failed
         LogContext(logContext).logErr("Unable to deserialize metadata in file: " + inodeFilename);
         goto error_exit;
      }

      retVal = true;
   }
   else
   if( (getRes == -1) && (errno == ENOENT) )
   { // file not exists
      LOG_DEBUG(logContext, Log_DEBUG, "Metadata file not exists: " +
         inodeFilename + ". " + "SysErr: " + System::getErrString() );
   }
   else
   { // unhandled error
      LogContext(logContext).logErr("Unable to open/read xattr metadata file: " +
         inodeFilename + ". " + "SysErr: " + System::getErrString() );
   }


error_exit:

   return retVal;
}

/**
 * Note: Don't call this directly, use the wrapper loadFromFile().
 */
bool DirInode::loadFromFileContents()
{
   const char* logContext = "Directory (load from file)";

   App* app = Program::getApp();
   const Path* inodePath =
      getIsBuddyMirrored() ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
   std::string inodeFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);

   bool retVal = false;

   int openFlags = O_NOATIME | O_RDONLY;

   int fd = open(inodeFilename.c_str(), openFlags, 0);
   if(fd == -1)
   { // open failed
      if(errno != ENOENT)
         LogContext(logContext).logErr("Unable to open metadata file: " + inodeFilename +
            ". " + "SysErr: " + System::getErrString() );

      return false;
   }

   char buf[META_SERBUF_SIZE];
   int readRes = read(fd, buf, META_SERBUF_SIZE);
   if(readRes <= 0)
   { // reading failed
      LogContext(logContext).logErr("Unable to read metadata file: " + inodeFilename + ". " +
         "SysErr: " + System::getErrString() );
   }
   else
   {
      Deserializer des(buf, readRes);
      DiskMetaData::deserializeDirInode(des, *this);
      if (!des.good())
      { // deserialization failed
         LogContext(logContext).logErr("Unable to deserialize metadata in file: " + inodeFilename);
      }
      else
      { // success
         retVal = true;
      }
   }

   close(fd);

   return retVal;
}

创建子目录DirInode

发送创建DirInode消息
// fhgfs_meta\source\net\message\storage\creating\MkDirMsgEx.cpp

/**
 * Create the dir inode on the metadata node given by the entry info's owner node ID, by
 * sending a MkLocalDirMsg and evaluating the response.
 *
 * @param parentDir the stripe pattern for the new directory is cloned from this directory
 * @param name only used for logging
 * @param entryInfo entry info of the new directory; provides the owner node ID and the
 *    buddy mirroring setting
 * @param defaultACLXAttr passed on in the MkLocalDirMsg
 * @param accessACLXAttr passed on in the MkLocalDirMsg
 * @return FhgfsOpsErr_SUCCESS if the remote inode was created, otherwise the communication
 *    error or the error reported by the remote node
 */
FhgfsOpsErr MkDirMsgEx::mkRemoteDirInode(DirInode& parentDir, const std::string& name,
   EntryInfo* entryInfo, const CharVector& defaultACLXAttr, const CharVector& accessACLXAttr)
{
   const char* logContext = "MkDirMsg (mk dir inode)";

   App* app = Program::getApp();

   FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;

   // the clone is owned by this function and deleted before returning
   StripePattern* clonedPattern = parentDir.getStripePatternClone();
   NumNodeID ownerNodeID = entryInfo->getOwnerNodeID();

   LOG_DEBUG(logContext, Log_DEBUG,
      "Creating dir inode at metadata node: " + ownerNodeID.str() + "; dirname: " + name);

   // prepare request

   NumNodeID parentNodeID = app->getLocalNode().getNumID();
   MkLocalDirMsg mkMsg(entryInfo, getUserID(), getGroupID(), getMode(), clonedPattern,
      parentNodeID, defaultACLXAttr, accessACLXAttr);

   RequestResponseArgs rrArgs(NULL, &mkMsg, NETMSGTYPE_MkLocalDirResp);

   RequestResponseNode rrNode(ownerNodeID, app->getMetaNodes() );
   rrNode.setTargetStates(app->getMetaStateStore() );

   if(entryInfo->getIsBuddyMirrored())
      rrNode.setMirrorInfo(app->getMetaBuddyGroupMapper(), false);

   // send request to other mds and receive response

   const FhgfsOpsErr requestRes = MessagingTk::requestResponseNode(&rrNode, &rrArgs);

   if(unlikely(requestRes != FhgfsOpsErr_SUCCESS) )
   { // communication error
      LogContext(logContext).log(Log_WARNING,
         "Communication with metadata server failed. "
         "nodeID: " + ownerNodeID.str() + "; " +
         "dirname: " + name);

      retVal = requestRes;
   }
   else
   { // correct response type received
      MkLocalDirRespMsg* mkRespMsg = (MkLocalDirRespMsg*)rrArgs.outRespMsg;

      const FhgfsOpsErr remoteRes = mkRespMsg->getResult();
      if(remoteRes != FhgfsOpsErr_SUCCESS)
      { // error: remote dir inode not created
         LogContext(logContext).log(Log_WARNING,
            "Metadata server failed to create dir inode. "
            "nodeID: " + ownerNodeID.str() + "; " +
            "dirname: " + name);

         retVal = remoteRes;
      }
      else
      { // success: remote dir inode created
         LOG_DEBUG(logContext, Log_DEBUG,
            "Metadata server created dir inode. "
            "nodeID: " + ownerNodeID.str() + "; "
            "dirname: " + name);
      }
   }

   delete(clonedPattern);

   return retVal;
}
处理创建DirInode消息
// fhgfs_meta\source\net\message\storage\creating\MkLocalDirMsgEx.cpp

/**
 * Create the dir inode described by this message in the local MetaStore.
 *
 * @return response state carrying the result of MetaStore::makeDirInode()
 */
std::unique_ptr<MirroredMessageResponseState> MkLocalDirMsgEx::executeLocally(ResponseContext& ctx,
   bool isSecondary)
{
   App* const application = Program::getApp();
   MetaStore* const store = application->getMetaStore();
   StripePattern& stripePattern = getPattern();

   EntryInfo* const newEntryInfo = getEntryInfo();
   const NumNodeID parentNode = getParentNodeID();
   const bool mirrored = newEntryInfo->getIsBuddyMirrored();

   // a mirrored dir is owned by the local buddy group, a non-mirrored one by the local node
   NumNodeID ownerID;
   if (mirrored)
      ownerID = NumNodeID(application->getMetaBuddyGroupMapper()->getLocalGroupID() );
   else
      ownerID = application->getLocalNode().getNumID();

   DirInode inode(newEntryInfo->getEntryID(), getMode(), getUserID(),
      getGroupID(), ownerID, stripePattern, newEntryInfo->getIsBuddyMirrored());

   inode.setParentInfoInitial(newEntryInfo->getParentEntryID(), parentNode);

   const FhgfsOpsErr createRes =
      store->makeDirInode(inode, getDefaultACLXAttr(), getAccessACLXAttr() );

   if (createRes == FhgfsOpsErr_SUCCESS && shouldFixTimestamps())
      fixInodeTimestamp(inode, dirTimestamps);

   return boost::make_unique<ResponseState>(createRes);
}
开始创建DirInode
// fhgfs_meta\source\storage\MetaStore.cpp

/**
 * Create a dir inode through the dir store while holding the MetaStore read lock.
 *
 * @return result of InodeDirStore::makeDirInode()
 */
FhgfsOpsErr MetaStore::makeDirInode(DirInode& inode, const CharVector& defaultACLXAttr,
   const CharVector& accessACLXAttr)
{
   UniqueRWLock readLock(rwlock, SafeRWLock_READ);

   const FhgfsOpsErr createRes = dirStore.makeDirInode(inode, defaultACLXAttr, accessACLXAttr);

   return createRes;
}

// fhgfs_meta\source\storage\InodeDirStore.cpp

/**
 * Locked variant: takes the store lock in write mode and delegates to makeDirInodeUnlocked().
 *
 * @param dir directory inode whose metadata is written to disk
 *    (NOTE(review): earlier docs stated that the store takes ownership of dir; that is not
 *    visible in this code path, where dir is a reference that is only persisted - confirm)
 * @param defaultACLXAttr value for the default ACL extended attribute
 * @param accessACLXAttr value for the access ACL extended attribute
 * @return result of makeDirInodeUnlocked()
 */
FhgfsOpsErr InodeDirStore::makeDirInode(DirInode& dir,
   const CharVector& defaultACLXAttr, const CharVector& accessACLXAttr)
{
   RWLockGuard writeLock(rwlock, SafeRWLock_WRITE);

   FhgfsOpsErr mkRes = makeDirInodeUnlocked(dir, defaultACLXAttr, accessACLXAttr);

   return mkRes;
}

/**
 * Unlocked variant: writes the persistent metadata of the given directory inode.
 *
 * No locking is done in this method itself.
 *
 * @param dir directory inode whose metadata is written to disk
 * @param defaultACLXAttr value for the default ACL extended attribute
 * @param accessACLXAttr value for the access ACL extended attribute
 * @return result of DirInode::storePersistentMetaData()
 */
FhgfsOpsErr InodeDirStore::makeDirInodeUnlocked(DirInode& dir, const CharVector& defaultACLXAttr,
         const CharVector& accessACLXAttr)
{
   FhgfsOpsErr storeRes = dir.storePersistentMetaData(defaultACLXAttr, accessACLXAttr);

   return storeRes;
}
保存DirInode元数据
// fhgfs_meta\source\storage\DirInode.h

// Abridged excerpt of the class: only the member relevant to directory creation is shown.
class DirInode
{
      /**
       * Store the metadata of this (new) directory; thin wrapper around
       * storeInitialMetaData(defaultACLXAttr, accessACLXAttr).
       *
       * @param defaultACLXAttr value for the default ACL extended attribute
       * @param accessACLXAttr value for the access ACL extended attribute
       * @return result of storeInitialMetaData()
       */
      FhgfsOpsErr storePersistentMetaData(const CharVector& defaultACLXAttr,
         const CharVector& accessACLXAttr)
      {
         return storeInitialMetaData(defaultACLXAttr, accessACLXAttr);
      }
}

// fhgfs_meta\source\storage\DirInode.cpp

/**
 * Store the initial on-disk metadata of this directory and then attach the given ACLs.
 *
 * An empty ACL vector is skipped. The first failing step ends the sequence and its error code
 * is returned.
 *
 * @param defaultACLXAttr stored under PosixACL::defaultACLXAttrName if not empty
 * @param accessACLXAttr stored under PosixACL::accessACLXAttrName if not empty
 * @return FhgfsOpsErr_SUCCESS if every executed step succeeded, otherwise the first error
 */
FhgfsOpsErr DirInode::storeInitialMetaData(const CharVector& defaultACLXAttr,
         const CharVector& accessACLXAttr)
{
   const FhgfsOpsErr storeRes = storeInitialMetaData();
   if (storeRes != FhgfsOpsErr_SUCCESS)
      return storeRes;

   // xattr updates done here are also resynced if storeInitialMetaData enqueued the inode
   if (!defaultACLXAttr.empty())
   {
      const FhgfsOpsErr defaultACLRes =
         setXAttr(nullptr, PosixACL::defaultACLXAttrName, defaultACLXAttr, 0);

      if (defaultACLRes != FhgfsOpsErr_SUCCESS)
         return defaultACLRes;
   }

   if (accessACLXAttr.empty())
      return FhgfsOpsErr_SUCCESS;

   return setXAttr(nullptr, PosixACL::accessACLXAttrName, accessACLXAttr, 0);
}

/*
 * Create the dentry store directory of this dir and then its metadata inode file.
 *
 * If the inode file cannot be created, the dentry store directory is removed again - except
 * when the inode file already exists, which is treated as success.
 *
 * Note: Current object state is used for the serialization.
 *
 * @return error of mkDentryStoreDir() if that step fails; FhgfsOpsErr_SUCCESS if the inode was
 *    stored or already existed; otherwise the error of storeInitialMetaDataInode()
 */
FhgfsOpsErr DirInode::storeInitialMetaData()
{
   const FhgfsOpsErr dentryDirRes =
      DirEntryStore::mkDentryStoreDir(this->id, this->getIsBuddyMirrored());

   if (dentryDirRes != FhgfsOpsErr_SUCCESS)
      return dentryDirRes;

   const FhgfsOpsErr inodeRes = storeInitialMetaDataInode();

   if (inodeRes == FhgfsOpsErr_SUCCESS)
      return FhgfsOpsErr_SUCCESS;

   // there must have been some kind of race as creating the dentry store dir was successful
   if (inodeRes == FhgfsOpsErr_EXISTS)
      return FhgfsOpsErr_SUCCESS;

   // real failure: remove the dentry store dir that was just created
   DirEntryStore::rmDirEntryStoreDir(this->id, this->getIsBuddyMirrored());

   return inodeRes;
}


/*
 * Creates the initial metadata inode file for this directory.
 *
 * The file is created exclusively (O_CREAT|O_EXCL). The current object state is serialized into
 * a stack buffer and stored either as an extended attribute of the new file or as its regular
 * content, depending on the "store use extended attribs" config setting. For a buddy-mirrored
 * dir the new file is added to the current resync change set, if one is active.
 *
 * Note: Current object state is used for the serialization.
 *
 * @return FhgfsOpsErr_SUCCESS on success; FhgfsOpsErr_EXISTS if the inode file already exists;
 *    FhgfsOpsErr_INTERNAL on any other failure (if serializing or storing the data fails, the
 *    file created by this call is unlinked again)
 */
FhgfsOpsErr DirInode::storeInitialMetaDataInode()
{
   const char* logContext = "Directory (store initial metadata file)";

   App* app = Program::getApp();
   // mirrored and non-mirrored inodes live below different base paths
   const Path* inodePath =
      getIsBuddyMirrored() ? app->getBuddyMirrorInodesPath() : app->getInodesPath();
   std::string metaFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);

   FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;
   bool useXAttrs = app->getConfig()->getStoreUseExtendedAttribs();

   // serialization target: fixed-size buffer on the stack
   char buf[META_SERBUF_SIZE];
   Serializer ser(buf, sizeof(buf));

   // create file

   // O_EXCL: an already existing inode file makes open() fail with EEXIST
   int openFlags = O_CREAT|O_EXCL|O_WRONLY;
   mode_t openMode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;

   int fd = open(metaFilename.c_str(), openFlags, openMode);
   if(fd == -1)
   { // error
      // errno is checked before anything else runs that could overwrite it
      if(errno == EEXIST)
         retVal = FhgfsOpsErr_EXISTS;
      else
      {
         LogContext(logContext).logErr("Unable to create dir metadata inode " + metaFilename +
            ". " + "SysErr: " + System::getErrString() );
         retVal = FhgfsOpsErr_INTERNAL;
      }

      // nothing was created by this call, so there is nothing to clean up
      goto error_donothing;
   }

   // serialize the current object state into the stack buffer

   DiskMetaData::serializeDirInode(ser, *this);
   if (!ser.good())
   {
      LOG(ERR, "Serialized metadata is larger than serialization buffer size.", id, parentDirID,
            metaFilename,
            as("Data size", ser.size()),
            as("Buffer size", META_SERBUF_SIZE));

      retVal = FhgfsOpsErr_INTERNAL;
      goto error_closefile;
   }

   // write data to file

   if(useXAttrs)
   { // extended attribute
      int setRes = fsetxattr(fd, META_XATTR_NAME, buf, ser.size(), 0);

      if(unlikely(setRes == -1) )
      { // error
         // ENOTSUP gets a dedicated hint about enabling xattrs on the underlying file system
         if(errno == ENOTSUP)
            LogContext(logContext).logErr("Unable to store directory xattr metadata: " +
               metaFilename + ". " +
               "Did you enable extended attributes (user_xattr) on the underlying file system?");
         else
            LogContext(logContext).logErr("Unable to store directory xattr metadata: " +
               metaFilename + ". " + "SysErr: " + System::getErrString() );

         retVal = FhgfsOpsErr_INTERNAL;

         goto error_closefile;
      }
   }
   else
   { // normal file content
      ssize_t writeRes = write(fd, buf, ser.size());

      // a short write is treated as an error, same as a failed write
      if(writeRes != (ssize_t)ser.size())
      {
         LogContext(logContext).logErr("Unable to store directory metadata: " + metaFilename +
            ". " + "SysErr: " + System::getErrString() );
         retVal = FhgfsOpsErr_INTERNAL;

         goto error_closefile;
      }
   }

   close(fd);

   LOG_DEBUG(logContext, Log_DEBUG, "Directory metadata inode stored: " + metaFilename);

   // record the new inode file in the active resync change set (mirrored dirs only)
   if (getIsBuddyMirrored())
      if (auto* resync = BuddyResyncer::getSyncChangeset())
         resync->addModification(metaFilename, MetaSyncFileType::Inode);

   return retVal;


   // error compensation: close and remove the file created above
error_closefile:
   close(fd);
   unlink(metaFilename.c_str() );

error_donothing:

   return retVal;
}
同步DirInode创建消息
// fhgfs_meta\source\components\buddyresyncer\SyncCandidate.h
// Abridged excerpt of the class: only addModification() is shown.
class MetaSyncCandidateFile
{
      /**
       * Append one modified path to this change set.
       *
       * @param path path of the modified file; taken by value and moved into the new element
       * @param type kind of metadata file that was modified
       */
      void addModification(std::string path, MetaSyncFileType type)
      {
         // third Element field is initialized to false (its meaning is not visible in this excerpt)
         paths.push_back(Element{std::move(path), type, false});
      }
}

创建子目录DirEntry

目录项ID文件名生成
// fhgfs_common\source\common\storage\Metadata.h

#define META_ROOTDIR_ID_STR            "root" /* initial file system entry point */
#define META_DISPOSALDIR_ID_STR        "disposal" /* for unlinked but still open files */
#define META_MIRRORDISPOSALDIR_ID_STR  "mdisposal"

/* number of hash subdirs per level below the inodes dir */
#define META_INODES_LEVEL1_SUBDIR_NUM  (128)
#define META_INODES_LEVEL2_SUBDIR_NUM  (128)
#define META_INODES_SUBDIR_NAME        "inodes"     /* contains local file system entry metadata */

/* number of hash subdirs per level below the dentries dir */
#define META_DENTRIES_LEVEL1_SUBDIR_NUM   (128)
#define META_DENTRIES_LEVEL2_SUBDIR_NUM   (128)
#define META_DENTRIES_SUBDIR_NAME         "dentries"   /* contains file system link structure */

#define META_DIRENTRYID_SUB_STR        "#fSiDs#" /* subdir with entryIDs for ID based file access
                                                     * this is a bit dangerous, as it might
                                                     * conflict with user file names, so we need
                                                     * to choose a good name */


// fhgfs_common\source\common\toolkit\StorageTk.h
// Abridged excerpt of the class: only the hash path helpers are shown.
class StorageTk
{
      /**
       * Build the full hashed path of an entry below the given base path.
       *
       * @param path base directory that is prepended to the hash path
       * @param entryID ID of the entry; used for hashing and as the last path element
       * @param numHashesLevel1 passed through to getHashes() for the first level
       * @param numHashesLevel2 passed through to getHashes() for the second level
       * @return path/hashDir1/hashDir2/fileName
       */
      static std::string getHashPath(const std::string path, const std::string entryID,
         size_t numHashesLevel1, size_t numHashesLevel2)
      {
         return path + "/" + getBaseHashPath(entryID, numHashesLevel1, numHashesLevel2);
      }

      /**
       * Build the relative hashed path of an entry; both hash dir names are the hex string
       * representation of the 16 bit hash values provided by getHashes().
       *
       * @param entryID ID of the entry; used for hashing and as the last path element
       * @param numHashesLevel1 passed through to getHashes() for the first level
       * @param numHashesLevel2 passed through to getHashes() for the second level
       * @return hashDir1/hashDir2/entryID
       */
      static std::string getBaseHashPath(const std::string entryID,
         size_t numHashesLevel1, size_t numHashesLevel2)
      {
         uint16_t hashLevel1;
         uint16_t hashLevel2;

         // both hash values are out-parameters of getHashes()
         getHashes(entryID, numHashesLevel1, numHashesLevel2, hashLevel1, hashLevel2);

         return StringTk::uint16ToHexStr(hashLevel1) + "/" + StringTk::uint16ToHexStr(hashLevel2) +
            "/" + entryID;
      }

}

// fhgfs_common\source\common\toolkit\MetaStorageTk.h
// Abridged excerpt of the class: only getMetaDirEntryIDPath() is shown.
class MetaStorageTk
{
      /**
       * Get path to the IDs subdir of a contents directory (i.e. the dir containing the dentries
       * by ID).
       *
       * @param metaDirEntryPath path to a contents dir (i.e. the dir containing dentries by name),
       * typically requires calling getMetaDirEntryPath() first
       * @return metaDirEntryPath + "/" + META_DIRENTRYID_SUB_STR + "/" (note the trailing slash,
       * so an entryID can be appended directly)
       */
      static std::string getMetaDirEntryIDPath(const std::string metaDirEntryPath)
      {
         return metaDirEntryPath + "/" META_DIRENTRYID_SUB_STR "/";
      }
};
开始创建子目录DirEntry
// fhgfs_meta\source\storage\DirEntryStore.cpp

/**
 * Locked variant: create the on-disk dentry for the given entry while holding the store lock in
 * write mode.
 *
 * @param entry the dir entry to store
 *    (NOTE(review): earlier docs stated that the entry belongs to the store afterwards and must
 *    not be freed or used by the caller; no ownership transfer is visible here - confirm)
 * @return result of makeEntryUnlocked()
 */
FhgfsOpsErr DirEntryStore::makeEntry(DirEntry* entry)
{
   SafeRWLock writeLock(&rwlock, SafeRWLock_WRITE);

   const FhgfsOpsErr result = makeEntryUnlocked(entry);

   writeLock.unlock();

   return result;
}

/**
 * Unlocked variant: store the initial dentry files of the given entry in this store's dentry
 * directory.
 *
 * Failures are logged here, except for FhgfsOpsErr_EXISTS, which is returned silently.
 *
 * @param entry the dir entry to store
 *    (NOTE(review): earlier docs stated that the entry belongs to the store afterwards; no
 *    ownership transfer is visible here - confirm)
 * @return result of DirEntry::storeInitialDirEntry()
 */
FhgfsOpsErr DirEntryStore::makeEntryUnlocked(DirEntry* entry)
{
   const char* logContext = "make meta dir-entry";
   const std::string& dentryDir = getDirEntryPathUnlocked();

   const FhgfsOpsErr storeRes = entry->storeInitialDirEntry(dentryDir);

   const bool alreadyExists = (storeRes == FhgfsOpsErr_EXISTS);

   if ( (storeRes != FhgfsOpsErr_SUCCESS) && !alreadyExists)
   {
      const std::string errMsg = std::string("Failed to create: name: ") + entry->getName() +
         std::string(" entryID: ") + entry->getID() + " in path: " + dentryDir;

      LogContext(logContext).logErr(errMsg);
   }

   return storeRes;
}
// fhgfs_meta\source\storage\DirEntry.cpp

/**
 * Store the initial on-disk representation of this dentry: first the dentry-by-ID file, then
 * the dentry-by-name file. For buddy-mirrored entries the created files are recorded in the
 * active resync change set (if any).
 *
 * Note: Must be called before any of the disk modifying methods
 * (otherwise they will fail)
 *
 * @param dirEntryPath path of the dentries directory; does not include the filename
 * @return error of storeInitialDirEntryID() if that step fails, otherwise the result of
 *    storeInitialDirEntryName()
 */
FhgfsOpsErr DirEntry::storeInitialDirEntry(const std::string& dirEntryPath)
{
   const char* logContext = DIRENTRY_LOG_CONTEXT "(store initial dirEntry)";

   LOG_DEBUG(logContext, 4, "Storing initial dentry metadata for ID: '" + getEntryID() + "'");

   // step 1: the dirEntry-by-ID file
   std::string idPath = MetaStorageTk::getMetaDirEntryIDPath(dirEntryPath) +  getEntryID();

   const FhgfsOpsErr idRes = this->storeInitialDirEntryID(logContext, idPath);
   if (idRes != FhgfsOpsErr_SUCCESS)
      return idRes;

   // step 2: the dirEntry-by-name file
   bool isDir = DirEntryType_ISDIR(getEntryType() );
   std::string namePath = dirEntryPath + '/' + this->name;

   const FhgfsOpsErr nameRes = this->storeInitialDirEntryName(logContext, idPath, namePath, isDir);

   // step 3: resync bookkeeping, only for successfully stored buddy-mirrored entries
   if (nameRes != FhgfsOpsErr_SUCCESS || !getIsBuddyMirrored())
      return nameRes;

   auto* resync = BuddyResyncer::getSyncChangeset();
   if (!resync)
      return nameRes;

   // the by-ID file is only recorded for non-directories
   if (!isDir)
      resync->addModification(idPath, MetaSyncFileType::Inode);

   resync->addModification(namePath, MetaSyncFileType::Dentry);

   return nameRes;
}
创建DirEntryID文件
// fhgfs_meta\source\storage\DirEntry.cpp

/*
 * Store the dirEntryID file. This is a normal dirEntry (with inlined inode),
 * but the file name is the entryID.
 *
 * The file is created exclusively, the dentry is serialized into a stack buffer and stored
 * either as an extended attribute or as regular file content, depending on the config.
 * If serialization or storing fails, the file created by this call is removed again.
 *
 * @param logContext context string used for log messages
 * @param idPath - path to the idFile, including the file name
 * @return FhgfsOpsErr_SUCCESS on success; FhgfsOpsErr_EXISTS if the ID file already exists;
 *    FhgfsOpsErr_INTERNAL on any other failure
 */
FhgfsOpsErr DirEntry::storeInitialDirEntryID(const char* logContext, const std::string& idPath)
{
   FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;

   char buf[DIRENTRY_SERBUF_SIZE];
   Serializer ser(buf, sizeof(buf));
   bool useXAttrs = Program::getApp()->getConfig()->getStoreUseExtendedAttribs();

   // create file

   /* note: if we ever think about switching to a rename-based version here, we must keep very
     long user file names in mind, which might lead to problems if we add an extension to the
     temporary file name. */

   int openFlags = O_CREAT|O_EXCL|O_WRONLY;

   int fd = open(idPath.c_str(), openFlags, 0644);
   if (unlikely (fd == -1) ) // this is our ID file, failing to create it is very unlikely
   { // error
      /* remember why open() failed before doing anything else: logging and the unlink() below
       * may overwrite errno, which would make the EMFILE/EEXIST checks unreliable */
      const int openErrno = errno;
      const std::string openErrStr = System::getErrString();

      LogContext(logContext).logErr("Unable to create dentry file: " + idPath + ". " +
         "SysErr: " + openErrStr);

      if (openErrno == EMFILE)
      { /* Creating the file succeeded, but there are already too many open file descriptors to
         * open the file. We don't want to leak an entry-by-id file, so delete it.
         * We only want to delete the file for specific errors, as for example EEXIST would mean
         * we would delete an existing (probably) working entry. */
         int unlinkRes = unlink(idPath.c_str() );
         if (unlinkRes && errno != ENOENT)
            LogContext(logContext).logErr("Failed to unlink failed dentry: " + idPath + ". " +
               "SysErr: " + System::getErrString() );
      }

      if (openErrno == EEXIST)
      {
         /* EEXIST never should happen, as our ID is supposed to be unique, but there are rare
          * cases as for the upgrade tool */
         retVal = FhgfsOpsErr_EXISTS;
         #ifdef BEEGFS_DEBUG
            LogContext(logContext).logBacktrace();
         #endif
      }
      else
      {
         retVal = FhgfsOpsErr_INTERNAL;
      }

      return retVal;
   }

   // serialize (to new buf)
   serializeDentry(ser);
   if (!ser.good())
   {
      LogContext(logContext).logErr("Dentry too large: " + idPath + ".");
      retVal = FhgfsOpsErr_INTERNAL;

      /* do not store anything: the data did not fit into buf, so ser.size() may be larger than
       * the buffer, and a half-written ID file must not be left behind */
      goto error_closefile;
   }

   // write buf to file

   if(useXAttrs)
   { // extended attribute
      int setRes = fsetxattr(fd, META_XATTR_NAME, buf, ser.size(), 0);

      if(unlikely(setRes == -1) )
      { // error
         LogContext(logContext).logErr("Unable to store dentry xattr metadata: " + idPath + ". " +
            "SysErr: " + System::getErrString() );
         retVal = FhgfsOpsErr_INTERNAL;
         goto error_closefile;
      }
   }
   else
   { // normal file content
      ssize_t writeRes = write(fd, buf, ser.size());

      // a short write is treated as an error, same as a failed write
      if(unlikely(writeRes != (ssize_t)ser.size()))
      { // error
         LogContext(logContext).logErr("Unable to store dentry metadata: " + idPath + ". " +
            "SysErr: " + System::getErrString() );
         retVal = FhgfsOpsErr_INTERNAL;
         goto error_closefile;
      }
   }

   close(fd);

   return retVal;

   // error compensation: close and remove the ID file created above
error_closefile:
   close(fd);

   int unlinkRes = unlink(idPath.c_str() );
   if (unlikely(unlinkRes && errno != ENOENT) )
   {
      LogContext(logContext).logErr("Storing the dentry-by-id file failed and "
         "now also deleting the dentry-by-id file fails: " + idPath);
   }

   return retVal;
}
创建DirEntryName文件
// fhgfs_meta\source\storage\DirEntry.cpp

/**
 * Store the dirEntry as file name, by hard-linking the already stored dentry-by-ID file to the
 * name path.
 *
 * If linking fails, the dentry-by-ID file is removed again. For directories the dentry-by-ID
 * file is also removed after successful linking, because it is not needed for them.
 *
 * @param logContext context string used for log messages
 * @param idPath path to the existing dentry-by-ID file
 * @param namePath path of the dentry-by-name file to create
 * @param isDir true if this entry is a directory
 * @return FhgfsOpsErr_SUCCESS if the name link was created; FhgfsOpsErr_EXISTS if the name
 *    already exists; FhgfsOpsErr_INTERNAL on any other link error
 */
FhgfsOpsErr DirEntry::storeInitialDirEntryName(const char* logContext, const std::string& idPath,
   const std::string& namePath, bool isDir)
{
   FhgfsOpsErr retVal = FhgfsOpsErr_SUCCESS;

   int linkRes = link(idPath.c_str(),  namePath.c_str() );
   if (linkRes)
   {  /* Creating the dirEntry-by-name failed, most likely this is EEXIST.
       * In principle it also might be possible there is an invalid dentry-by-name file,
       * however, we already want to delete those during lookup calls now. So invalid
       * entries are supposed to be very very unlikely and so no self-healing code is
       * implemented here. */

      // errno is evaluated right away, before any other call can overwrite it
      if (likely(errno == EEXIST) )
         retVal = FhgfsOpsErr_EXISTS;
      else
      {
         LogContext(logContext).logErr("Creating the dentry-by-name file failed: Path: " +
            namePath + " SysErr: " + System::getErrString() );

         retVal = FhgfsOpsErr_INTERNAL;
      }

      // roll back: the dentry-by-ID file is useless without its name link
      int unlinkRes = unlink(idPath.c_str() );
      if (unlikely(unlinkRes) )
      {
         LogContext(logContext).logErr("Creating the dentry-by-name file failed and "
            "now also deleting the dentry-by-id file fails: " + idPath);
      }

      return retVal;
   }

   if (isDir)
   {
      // unlink the dentry-by-id file - we don't need it for dirs (or non-inlined inodes in general)
      int unlinkRes = unlink(idPath.c_str() );
      if (unlikely(unlinkRes) )
      {
         // only logged: the name link exists, so the operation still counts as successful
         LogContext(logContext).logErr("Failed to unlink the (dir) dentry-by-id file "+ idPath +
            " SysErr: " + System::getErrString() );
      }
   }

   /* TODO: mkdir() might be a bit slow due to this 3-way operation (dentry-by-id, link-to-name,
    *       remove dentry-by-id. If it is too slow we might need to switch to a simple
    *       create-dentry-by-name, but which wouldn't set xattrs atomically from clients point of
    *       view.  */

   LOG_DEBUG(logContext, 4, "Initial dirEntry stored: " + namePath);

   return retVal;
}
同步DirEntry创建消息
// fhgfs_meta\source\storage\DirInode.cpp

/**
 * Abridged excerpt: "..." marks code that is omitted here; only the resync bookkeeping of the
 * method is shown.
 *
 * @param entry the dir entry to create (its handling is part of the omitted code)
 * @param deleteEntry  shall we delete entry or does the caller still need it?
 */
FhgfsOpsErr DirInode::makeDirEntryUnlocked(DirEntry* entry, bool deleteEntry)
{
...

   // buddy-mirrored dir with an active resync change set: record this directory's own inode
   // file (located via its id below the buddy-mirror inodes path) as modified
   if (getIsBuddyMirrored())
   {
      if (auto* resync = BuddyResyncer::getSyncChangeset())
      {
         const Path* inodePath = Program::getApp()->getBuddyMirrorInodesPath();
         std::string inodeFilename = MetaStorageTk::getMetaInodePath(inodePath->str(), id);
         resync->addModification(inodeFilename, MetaSyncFileType::Inode);
      }
   }

...
}
// fhgfs_meta\source\net\message\MirroredMessage.h
// Abridged excerpt: "..." marks omitted code; only the end of the message processing (resync
// change set handling and sending of the response) is shown.
template<typename BaseT, typename LockStateT>
class MirroredMessage : public BaseT
{
...
         // an active change set is committed only if this is a mirrored message that is not the
         // secondary's copy, a response exists and that response changes observable state;
         // in every other case the change set is abandoned
         if (BuddyResyncer::getSyncChangeset())
         {
            if (isMirrored() &&
                  !this->hasFlag(NetMessageHeader::Flag_BuddyMirrorSecond) &&
                  responsePtr &&
                  responsePtr->changesObservableState())
               BuddyResyncer::commitThreadChangeSet();
            else
               BuddyResyncer::abandonSyncChangeset();
         }

         // send the real response only if communication with the buddy worked; otherwise send
         // a generic error response (nothing is sent if there is no response and buddy
         // communication was successful)
         if (responsePtr && buddyCommSuccessful)
            responsePtr->sendResponse(ctx);
         else if (!buddyCommSuccessful)
            ctx.sendResponse(
                  GenericResponseMsg(
                     GenericRespMsgCode_INDIRECTCOMMERR_NOTAGAIN,
                     "Communication with secondary failed"));

         // reset the lock state (presumably this releases the locks taken for this message -
         // confirm against LockStateT)
         lockState = {};
      }
...
}


// fhgfs_meta\source\components\buddyresyncer\BuddyResyncer.cpp

void BuddyResyncer::commitThreadChangeSet()
{
   BEEGFS_BUG_ON(!currentThreadChangeSet, "no change set active");

   auto* job = Program::getApp()->getBuddyResyncer()->getResyncJob();

   std::unique_ptr<MetaSyncCandidateFile> candidate(currentThreadChangeSet);
   currentThreadChangeSet = nullptr;

   Barrier syncDone(2);

   candidate->prepareSignal(syncDone);

   job->enqueue(std::move(*candidate), PThread::getCurrentThread());
   syncDone.wait();
}

元数据序列化

DirInode序列化
// fhgfs_meta\source\storage\DiskMetaData.cpp

/**
 * Process the dir inode fields whose on-disk layout depends on the inode's feature flags.
 *
 * Field order: numSubdirs comes before the stat data if DIRINODE_FEATURE_EARLY_SUBDIRS is set,
 * otherwise directly after it; then numFiles, id and parentDirID (both strings 4-byte aligned).
 * The stat data format is selected by DIRINODE_FEATURE_STATFLAGS.
 *
 * @param inode the inode object whose fields are processed
 * @param ctx the context object the fields are applied to via operator%
 */
template<typename Inode, typename Ctx>
void DiskMetaData::serializeDirInodeCommonData(Inode& inode, Ctx& ctx)
{
   const bool subdirsFirst = (inode.featureFlags & DIRINODE_FEATURE_EARLY_SUBDIRS) != 0;

   const auto statFormat = (inode.featureFlags & DIRINODE_FEATURE_STATFLAGS)
      ? StatDataFormat_DIRINODE
      : StatDataFormat_DIRINODE_NOFLAGS;

   if (subdirsFirst)
      ctx % inode.numSubdirs;

   ctx % inode.statData.serializeAs(statFormat);

   if (!subdirsFirst)
      ctx % inode.numSubdirs;

   ctx
      % inode.numFiles
      % serdes::stringAlign4(inode.id)
      % serdes::stringAlign4(inode.parentDirID);
}

/*
 * Serialize a directory inode: a header (metadata type, storage format, feature flags), the
 * common dir inode data, then owner node ID, parent node ID and stripe pattern.
 *
 * Side effect: DIRINODE_FEATURE_EARLY_SUBDIRS and DIRINODE_FEATURE_STATFLAGS are set in the
 * inode's feature flags before serialization.
 *
 * Note: Current object state is used for the serialization
 * Note: the total amount of serialized data may not be larger than META_SERBUF_SIZE
 */
void DiskMetaData::serializeDirInode(Serializer& ser, DirInode& inode)
{
   const auto alwaysSetFlags = DIRINODE_FEATURE_EARLY_SUBDIRS | DIRINODE_FEATURE_STATFLAGS;
   inode.featureFlags |= alwaysSetFlags;

   // header
   ser
      % static_cast<uint8_t>(DiskMetaDataType_DIRINODE)
      % static_cast<uint8_t>(DIRECTORY_STORAGE_FORMAT_32BIT)
      % inode.featureFlags;

   // fields whose layout depends on the feature flags
   serializeDirInodeCommonData<const DirInode>(inode, ser);

   // trailer
   ser
      % inode.ownerNodeID
      % inode.parentNodeID
      % inode.stripePattern;
}
DirEntry序列化
// fhgfs_meta\source\storage\DirEntry.h
// Abridged excerpt of the class: only serializeDentry() is shown.
class DirEntry
{
      /**
       * Serialize this dentry in its on-disk format.
       *
       * @param ser serializer the dentry is written to
       */
      void serializeDentry(Serializer& ser)
      {
         // temporary DiskMetaData helper built from pointers to this entry's dentry and inode data
         DiskMetaData diskMetaData(&this->dentryDiskData, &this->inodeData);
         diskMetaData.serializeDentry(ser);
      }
}


// fhgfs_meta\source\storage\DiskMetaData.cpp

/**
 * Serialize the dentry in dentry format; the entry type of the dentry data decides whether it
 * is tagged as directory dentry or as file dentry.
 *
 * @param ser serializer the dentry is written to
 */
void DiskMetaData::serializeDentry(Serializer& ser)
{
   const bool isDir = DirEntryType_ISDIR(this->dentryDiskData->getDirEntryType() );

   DiskMetaDataType metaDataType =
      isDir ? DiskMetaDataType_DIRDENTRY : DiskMetaDataType_FILEDENTRY;

   serializeInDentryFormat(ser, metaDataType);
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值