Lustre文件系统fid介绍

fid介绍

fid是Lustre文件系统中文件(或对象)的唯一标识,总共128位,由三部分组成:64位的fid序列号(f_seq)、32位的序列内编号(f_oid)和32位的fid版本号(f_ver,目前未使用,默认为0)。

/**
 * File IDentifier.
 *
 * FID is a cluster-wide unique identifier of a file or an object (stripe).
 * FIDs are never reused.
 *
 * Layout: f_seq (__u64) + f_oid (__u32) + f_ver (__u32), 128 bits in total.
 **/
struct lu_fid {
       /**
	* FID sequence. Sequence is a unit of migration: all files (objects)
	* with FIDs from a given sequence are stored on the same server.
	* Lustre should support 2^64 objects, so even if each sequence
	* has only a single object we can still enumerate 2^64 objects.
	**/
    __u64 f_seq;
    /* FID number within sequence. */
    __u32 f_oid;
    /**
	 * FID version, used to distinguish different versions (in the sense
	 * of snapshots, etc.) of the same file system object. Not currently
	 * used.
	 **/
    __u32 f_ver;
};

fid获取流程

fld:fid location database
sequence controller: 运行在MDT0上,拥有全量的fld信息
sequence server:运行在所有MDT(包括MDT0,见下文mdt_seq_init)和OST上,各sequence server持有的sequence范围互相不会有重叠,是MDT0上fld的子集
sequence client:每个客户端在挂载文件系统时会提前申请一部分sequence,每个客户端拿到的sequence不会有重叠
管理fid范围的结构体:

/**
 * Describes a range of sequence, lsr_start is included but lsr_end is
 * not in the range.
 * Same structure is used in fld module where lsr_index field holds mdt id
 * of the home mdt.
 */
struct lu_seq_range {
    __u64 lsr_start; // first sequence number of the range (inclusive)
    __u64 lsr_end;  // end of the range (exclusive)
    __u32 lsr_index; // in the fld module: mdt id of the home mdt
    __u32 lsr_flags; // range flags; their meaning is not shown in this excerpt
};

在同一个MDT上创建的文件,如果序列号未使用完,则这些文件的序列号相同,fid序列内编号依次递增。

如果序列号使用完,则客户端会向服务端申请下一批序列号
例:假设同一客户端依次在MDT0上创建test1和test2,那么test1的fid为[0x20001:0x1:0x0],test2的fid为[0x20001:0x2:0x0](序列号相同,序列内编号依次递增)。
在这里插入图片描述

fid申请流程

服务端初始化阶段

mdt:

/*
 * Set up the sequence services hosted by an MDT.
 *
 * On the node whose ss_node_id is 0 (MDT0) a sequence controller is created
 * first.  Every MDT then creates a normal sequence server and a sequence
 * client that the server uses to talk to the controller; nodes other than
 * MDT0 additionally register the controller export.
 *
 * Returns 0 on success or a negative errno.  Except for a failed controller
 * allocation (which returns immediately), every error path goes through
 * mdt_seq_fini().
 */
static int mdt_seq_init(const struct lu_env *env, struct mdt_device *mdt)
{
	struct seq_server_site *site;
	int rc;
	ENTRY;

	site = mdt_seq_site(mdt);

	/* init sequence controller server (MDT0 only) */
	if (site->ss_node_id == 0) {
		OBD_ALLOC_PTR(site->ss_control_seq);
		if (!site->ss_control_seq)
			RETURN(-ENOMEM);

		rc = seq_server_init(env, site->ss_control_seq,
				     mdt->mdt_bottom, mdt_obd_name(mdt),
				     LUSTRE_SEQ_CONTROLLER, site);
		if (rc != 0)
			GOTO(out_seq_fini, rc);
	}

	/* Init normal sequence server; not guarded by ss_node_id. */
	OBD_ALLOC_PTR(site->ss_server_seq);
	if (!site->ss_server_seq)
		GOTO(out_seq_fini, rc = -ENOMEM);

	rc = seq_server_init(env, site->ss_server_seq, mdt->mdt_bottom,
			     mdt_obd_name(mdt), LUSTRE_SEQ_SERVER, site);
	if (rc != 0)
		GOTO(out_seq_fini, rc);

	/* init seq client for seq server to talk to seq controller (MDT0) */
	rc = mdt_seq_init_cli(env, mdt);
	if (rc != 0)
		GOTO(out_seq_fini, rc);

	/* nodes other than MDT0 register controller export through lwp */
	if (site->ss_node_id != 0)
		rc = mdt_register_seq_exp(mdt);

	EXIT;
out_seq_fini:
	if (rc != 0)
		mdt_seq_fini(env, mdt);

	return rc;
}

ost:

/*
 * Set up the sequence server and its sequence client on an OST (OFD).
 *
 * Wires the OFD's seq_server_site into the lu_site, initialises a
 * LUSTRE_SEQ_SERVER instance, then a LUSTRE_SEQ_DATA client named
 * "<obd_name>-super", and attaches the client to the server.
 *
 * Returns 0 on success or a negative errno; on failure everything that was
 * set up here is torn down again.  The temporary name buffer is always freed.
 */
int ofd_fid_init(const struct lu_env *env, struct ofd_device *ofd)
{
	struct seq_server_site *site = &ofd->ofd_seq_site;
	struct lu_device *lu_dev = &ofd->ofd_dt_dev.dd_lu_dev;
	char *obd_name = ofd_name(ofd);
	/* room for obd_name + "-super" + terminating NUL */
	int len = strlen(obd_name) + 7;
	char *name = NULL;
	int rc = 0;

	lu_dev->ld_site->ld_seq_site = site;
	site->ss_lu = lu_dev->ld_site;
	site->ss_node_id = ofd->ofd_lut.lut_lsd.lsd_osd_index;

	OBD_ALLOC(name, len);
	if (!name)
		return -ENOMEM;

	OBD_ALLOC_PTR(site->ss_server_seq);
	if (!site->ss_server_seq)
		GOTO(out_name, rc = -ENOMEM);

	/* run a sequence server on the OST */
	rc = seq_server_init(env, site->ss_server_seq, ofd->ofd_osd,
			     obd_name, LUSTRE_SEQ_SERVER, site);
	if (rc != 0) {
		CERROR("%s: seq server init error: rc = %d\n", obd_name, rc);
		GOTO(out_server, rc);
	}
	site->ss_server_seq->lss_space.lsr_index = site->ss_node_id;

	OBD_ALLOC_PTR(site->ss_client_seq);
	if (!site->ss_client_seq)
		GOTO(out_server, rc = -ENOMEM);

	snprintf(name, len, "%s-super", obd_name);

	/* initialise the seq client used by this OST's seq server */
	rc = seq_client_init(site->ss_client_seq, NULL, LUSTRE_SEQ_DATA,
			     name, NULL);
	if (rc != 0) {
		CERROR("%s: seq client init error: rc = %d\n", obd_name, rc);
		GOTO(out_client, rc);
	}

	rc = seq_server_set_cli(env, site->ss_server_seq,
				site->ss_client_seq);
	if (rc == 0)
		goto out_name;	/* success: keep client and server */

out_client:
	seq_client_fini(site->ss_client_seq);
	OBD_FREE_PTR(site->ss_client_seq);
	site->ss_client_seq = NULL;
out_server:
	seq_server_fini(site->ss_server_seq, env);
	OBD_FREE_PTR(site->ss_server_seq);
	site->ss_server_seq = NULL;
out_name:
	OBD_FREE(name, len);

	return rc;
}

当客户端创建新文件时,会先检查本地已申请到的sequence是否够用;够用的话直接在本地分配fid,然后在向MDT发送创建请求时将新分配的fid一并发送给MDT,由MDT进行处理。

当客户端本地的sequence不够用时,会向sequence server申请新的sequence,如果sequence server上的sequence也不够用了,那么sequence server会向sequence controller申请新的sequence,最终返回新的sequence给客户端。
在这里插入图片描述

seq cli <–> seq svr

当client中申请的seq使用完之后会向server申请新的seq

/*
 * Abridged excerpt ("......" marks elided code): the part of
 * seq_client_alloc_seq() that refills the client's range.
 *
 * Returns 0 when the current range still has room or a new range was
 * obtained, otherwise the error from seq_client_alloc_meta().
 */
static int seq_client_alloc_seq(const struct lu_env *env,
				struct lu_client_seq *seq, u64 *seqnr)
{
	......
    // When the client's range is exhausted, call seq_client_alloc_meta() to obtain a new one
	if (lu_seq_range_is_exhausted(&seq->lcs_space)) {
                rc = seq_client_alloc_meta(env, seq);
                if (rc) {
			/* -EINPROGRESS is returned without logging an error */
			if (rc != -EINPROGRESS)
				CERROR("%s: Can't allocate new meta-sequence,"
				       "rc = %d\n", seq->lcs_name, rc);
                        RETURN(rc);
                } else {
                        CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
                               seq->lcs_name, PRANGE(&seq->lcs_space));
                }
        } else {
                rc = 0;
        }
		......
        RETURN(rc);
}


/*
 * Abridged excerpt ("......" marks elided code): send a sequence request
 * and copy the range from the reply into *output.
 */
static int seq_client_rpc(struct lu_client_seq *seq,
                          struct lu_seq_range *output, __u32 opc,
                          const char *opcname)
{
	......
		/* choose request/reply portals according to the client type */
		if (seq->lcs_type == LUSTRE_SEQ_METADATA) {
			req->rq_reply_portal = MDC_REPLY_PORTAL;
			req->rq_request_portal = SEQ_METADATA_PORTAL;
		} else {
			req->rq_reply_portal = OSC_REPLY_PORTAL;
			req->rq_request_portal = SEQ_DATA_PORTAL;
		}
	......
	rc = ptlrpc_queue_wait(req);
    if (rc)
		GOTO(out_req, rc);

	// fetch the newly allocated sequence range from the reply and hand it to the caller
	out = req_capsule_server_get(&req->rq_pill, &RMF_SEQ_RANGE);
	*output = *out;
	......
}

seq server对应处理函数:

/*
 * Request handler for sequence allocation RPCs.
 *
 * Reads the opcode and the client-supplied range from the request, copies
 * the client's lsr_index/lsr_flags into the reply range and lets
 * seq_server_handle() perform the allocation.
 *
 * Returns the result of seq_server_handle(), or err_serious(-EPROTO) when
 * the opcode, the request range or the reply range buffer is missing.
 */
static int seq_handler(struct tgt_session_info *tsi)
{
	struct lu_seq_range	*out, *tmp;
	struct lu_site		*site;
	int			 rc;
	__u32			*opc;

	ENTRY;

	LASSERT(!(lustre_msg_get_flags(tgt_ses_req(tsi)->rq_reqmsg) & MSG_REPLAY));
	site = tsi->tsi_exp->exp_obd->obd_lu_dev->ld_site;
	LASSERT(site != NULL);

	opc = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_OPC);
	if (opc != NULL) {
		out = req_capsule_server_get(tsi->tsi_pill, &RMF_SEQ_RANGE);
		if (out == NULL)
			RETURN(err_serious(-EPROTO));

		tmp = req_capsule_client_get(tsi->tsi_pill, &RMF_SEQ_RANGE);
		/*
		 * The request range is dereferenced below; treat a missing
		 * one like a missing opcode instead of dereferencing NULL.
		 */
		if (tmp == NULL)
			RETURN(err_serious(-EPROTO));

		/* seq client passed mdt id, we need to pass that using out
		 * range parameter */

		out->lsr_index = tmp->lsr_index;
		out->lsr_flags = tmp->lsr_flags;
		/* the new sequence range is allocated by this call */
		rc = seq_server_handle(site, tsi->tsi_env, *opc, out);
	} else {
		rc = err_serious(-EPROTO);
	}

	RETURN(rc);
}

/*
 * Abridged excerpt: the SEQ_ALLOC_META branch of seq_server_handle().
 * Local declarations and the remaining cases are elided ("......").
 */
static int seq_server_handle(struct lu_site *site,
                             const struct lu_env *env,
                             __u32 opc, struct lu_seq_range *out)
{

	switch (opc) {
	case SEQ_ALLOC_META:
		/* fail with -EINVAL if this node has no sequence server */
		if (!ss_site->ss_server_seq) {
			CERROR("Sequence server is not "
			       "initialized\n");
			RETURN(-EINVAL);
		}

		/* refuse to allocate when the backing device is read-only */
		dev = lu2dt_dev(ss_site->ss_server_seq->lss_obj->do_lu.lo_dev);
		if (dev->dd_rdonly)
			RETURN(-EROFS);

		rc = seq_server_alloc_meta(ss_site->ss_server_seq, out, env);
        break;
        ......
}

/*
 * Abridged excerpt ("......" marks elided code): run
 * __seq_server_alloc_meta() while holding seq->lss_mutex and return its
 * result.
 */
int seq_server_alloc_meta(struct lu_server_seq *seq,
                          struct lu_seq_range *out,
                          const struct lu_env *env)
{
	......
	mutex_lock(&seq->lss_mutex);
        rc = __seq_server_alloc_meta(seq, out, env);
	mutex_unlock(&seq->lss_mutex);
    RETURN(rc);
}

/*
 * Pseudo-code outline of __seq_server_alloc_meta(), not compilable C.
 * "......" marks elided code; the braces after the call to
 * seq_server_check_and_alloc_super() sketch what that callee does, and
 * "耗尽" in the sketch means "exhausted".
 */
static int __seq_server_alloc_meta(struct lu_server_seq *seq,
				   struct lu_seq_range *out,
				   const struct lu_env *env)
{
	struct lu_seq_range *space = &seq->lss_space;
	int rc = 0;
	......
     // check whether the server still has sequences; if not, request more from the controller
	rc = seq_server_check_and_alloc_super(env, seq){
      	if 耗尽{
        	// request a new usable sequence range from MDT0
            rc = seq_client_alloc_super(seq->lss_cli, env);
            // insert it into the local sequence server's fld
            rc = fld_insert_entry(env, fld, space);
        }
	}
	......
	// per the article: updates the server's sequence state, persists it and fills in out
	// NOTE(review): range_alloc_set() is not shown in this excerpt - confirm
	rc = range_alloc_set(env, out, seq);
	......
}

seq svr <–> seq controller

当seq server中seq不足时,会向seq controller申请新的seq
seq server :

/*
 * Refill the sequence server's allocation space when it has run out.
 *
 * If seq->lss_space is exhausted, a new super-sequence is requested through
 * the server's sequence client and copied into lss_space; when that client
 * has no lcs_srv, the new range is also inserted into the local FLDB under
 * lsf_lock.  Afterwards, if lss_lowater_set is still zero, __seq_set_init()
 * is called.
 *
 * Returns 0 on success, the error from seq_client_alloc_super() (returned
 * immediately), or the result of fld_insert_entry().
 */
int seq_server_check_and_alloc_super(const struct lu_env *env,
				     struct lu_server_seq *seq)
{
	struct lu_seq_range *range = &seq->lss_space;
	struct lu_server_fld *fld;
	int rc = 0;

	ENTRY;

	/* Check if available space ends and allocate new super seq */
	if (lu_seq_range_is_exhausted(range)) {
		/* ask the controller (MDT0) for a new super-sequence */
		rc = seq_client_alloc_super(seq->lss_cli, env);
		if (rc != 0) {
			CDEBUG(D_HA, "%s: Can't allocate super-sequence:"
			      " rc %d\n", seq->lss_name, rc);
			RETURN(rc);
		}

		/* Saving new range to allocation space. */
		*range = seq->lss_cli->lcs_space;
		LASSERT(lu_seq_range_is_sane(range));

		if (!seq->lss_cli->lcs_srv) {
			/* Insert it to the local FLDB */
			fld = seq->lss_site->ss_server_fld;

			mutex_lock(&fld->lsf_lock);
			rc = fld_insert_entry(env, fld, range);
			mutex_unlock(&fld->lsf_lock);
		}
	}

	if (lu_seq_range_is_zero(&seq->lss_lowater_set))
		__seq_set_init(env, seq);

	RETURN(rc);
}

controller:

/*
 * Dispatch a sequence allocation request by opcode (controller-side view;
 * the SEQ_ALLOC_META branch is elided with "......").
 *
 * SEQ_ALLOC_SUPER requires ss_control_seq to be set (-EINVAL otherwise) and
 * a writable device (-EROFS otherwise), then calls seq_server_alloc_super().
 * Unknown opcodes yield -EINVAL.
 */
static int seq_server_handle(struct lu_site *site,
                             const struct lu_env *env,
                             __u32 opc, struct lu_seq_range *out)
{
	int rc;
	struct seq_server_site *ss_site;
	struct dt_device *dev;
	ENTRY;

	/* obtain the seq_server_site for this lu_site */
	ss_site = lu_site2seq(site);

	switch (opc) {
	case SEQ_ALLOC_META:
		......
        break;
	case SEQ_ALLOC_SUPER:
		/* fail with -EINVAL if this node has no sequence controller */
		if (!ss_site->ss_control_seq) {
			CERROR("Sequence controller is not "
			       "initialized\n");
			RETURN(-EINVAL);
		}

		/* refuse to allocate when the backing device is read-only */
		dev = lu2dt_dev(ss_site->ss_control_seq->lss_obj->do_lu.lo_dev);
		if (dev->dd_rdonly)
			RETURN(-EROFS);

		rc = seq_server_alloc_super(ss_site->ss_control_seq, out, env);
		break;
	default:
		rc = -EINVAL;
		break;
	}

	RETURN(rc);
}

/*
 * Allocate a super-sequence range into @out.
 *
 * Locked wrapper: runs __seq_server_alloc_super() while holding
 * seq->lss_mutex and returns its result.
 */
int seq_server_alloc_super(struct lu_server_seq *seq,
			   struct lu_seq_range *out,
			   const struct lu_env *env)
{
	int result;
	ENTRY;

	mutex_lock(&seq->lss_mutex);
	result = __seq_server_alloc_super(seq, out, env);
	mutex_unlock(&seq->lss_mutex);

	RETURN(result);
}

/*
 * Carve a super-sequence of seq->lss_width out of seq->lss_space.
 *
 * Returns -ENOSPC when the space is exhausted; otherwise the range is
 * allocated into @out and the result of seq_store_update() (called with the
 * sync argument set to 1) is returned.
 */
static int __seq_server_alloc_super(struct lu_server_seq *seq,
				    struct lu_seq_range *out,
				    const struct lu_env *env)
{
	struct lu_seq_range *avail = &seq->lss_space;
	int rc;
	ENTRY;

	LASSERT(lu_seq_range_is_sane(avail));

	if (lu_seq_range_is_exhausted(avail)) {
		CERROR("%s: Sequences space is exhausted\n",
		       seq->lss_name);
		RETURN(-ENOSPC);
	}

	/* hand out the next lss_width-sized range from the server's space */
	range_alloc(out, avail, seq->lss_width);

	/* store the updated state through seq_store_update() */
	rc = seq_store_update(env, seq, out, 1 /* sync */);

	LCONSOLE_INFO("%s: super-sequence allocation rc = %d " DRANGE"\n",
		      seq->lss_name, rc, PRANGE(out));

	RETURN(rc);
}

上述内容如果有理解不正确的地方,欢迎各位大佬指正[手动抱拳]

参考了一位大佬的文章,链接如下:https://cloud.tencent.com/developer/article/2074601

  • 9
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值