Similarities and differences between librados and libcephfs in Ceph

Update (08/14): detailed call paths of librados and libcephfs

Overall comparison

librados interfaces take object information (object name, offset within the object) directly as input, whereas libcephfs interfaces take file offsets that must first go through file_to_extents to be mapped onto the underlying objects. Both rbd and cephfs need this offset-to-object translation.
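To make the contrast concrete, here is a minimal sketch of the mapping a file write has to perform before it can become object writes. This is not Ceph's actual Striper::file_to_extents (which also handles stripe_unit/stripe_count striping); it assumes the simplest layout where a file is just cut into fixed-size objects (4 MiB by default) and the names SimpleExtent/map_file_offset are illustrative only:

#include <cstdint>
#include <cstdio>

// Hypothetical simplified mapping: assumes stripe_unit == object_size and
// stripe_count == 1, so the file is simply chopped into fixed-size objects.
struct SimpleExtent {
  uint64_t object_no;   // which object of the file (e.g. <ino>.<object_no>)
  uint64_t obj_offset;  // offset inside that object
  uint64_t length;      // bytes of the request that land in this object
};

SimpleExtent map_file_offset(uint64_t file_off, uint64_t len, uint64_t object_size) {
  SimpleExtent e;
  e.object_no  = file_off / object_size;
  e.obj_offset = file_off % object_size;
  // clamp to the object boundary; a write spanning objects yields one extent per object
  uint64_t room = object_size - e.obj_offset;
  e.length = len < room ? len : room;
  return e;
}

int main() {
  // 4 MiB objects; an 8 KiB write at file offset 6 MiB lands in object 1 at offset 2 MiB
  SimpleExtent e = map_file_offset(6u << 20, 8192, 4u << 20);
  std::printf("object_no=%llu obj_offset=%llu length=%llu\n",
              (unsigned long long)e.object_no,
              (unsigned long long)e.obj_offset,
              (unsigned long long)e.length);
  return 0;
}

A librados caller never needs this step: it names the object and the offset within it directly.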

1 librados usage flow
1.1 rados initialization flow

  An application first calls rados_create2 to create the cct (CephContext) and the RadosClient, then calls rados_connect to connect to the cluster.

// src/include/rados/librados.h
CEPH_RADOS_API int rados_create2(rados_t *pcluster,
                                 const char *const clustername,
                                 const char * const name, uint64_t flags);

// src/librados/librados_c.cc  initialize the cct and the RadosClient
extern "C" int LIBRADOS_C_API_DEFAULT_F(rados_create2)(
  rados_t *pcluster,
  const char *const clustername,
  const char * const name,
  uint64_t flags)
{
  // client is assumed, but from_str will override
  int retval = 0;
  CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
  if (!name || !iparams.name.from_str(name)) {
    retval = -EINVAL;
  }

  CephContext *cct = rados_create_cct(clustername, &iparams); // create the cluster context (cct)
  tracepoint(librados, rados_create2_enter, clustername, name, flags);
  if (retval == 0) {
    *pcluster = reinterpret_cast<rados_t>(new librados::RadosClient(cct)); // new a RadosClient; RadosClient is the top-level wrapper, the actual per-pool I/O is wrapped one level down in the IoCtx class
  }
  tracepoint(librados, rados_create2_exit, retval, *pcluster);

  cct->put();
  return retval;
}

// To establish a connection with the cluster, call rados_connect
extern "C" int LIBRADOS_C_API_DEFAULT_F(rados_connect)(rados_t cluster)
{
  tracepoint(librados, rados_connect_enter, cluster);
  librados::RadosClient *client = (librados::RadosClient *)cluster;
  int retval = client->connect();
  tracepoint(librados, rados_connect_exit, retval);
  return retval;
}
LIBRADOS_C_API_BASE_DEFAULT(rados_connect);

 radosclient->connect invokes RadosClient::connect(). The function is long, so only the key parts are excerpted here; its main job is to create the messenger, the objecter, etc., and to finish module initialization such as the log and admin socket (asok) threads.

int librados::RadosClient::connect()
{
  int err;

  // create a temporary MonClient to bootstrap the monmap and cluster configuration

  {
    MonClient mc_bootstrap(cct, poolctx);
    err = mc_bootstrap.get_monmap_and_config();
    if (err < 0)
      return err;
  }

  common_init_finish(cct); // start the log module and the admin socket (asok)

  // create the messenger, the top-level messaging abstraction (an AsyncMessenger in practice)
  messenger = Messenger::create_client_messenger(cct, "radosclient");

 
  // create the Objecter; this module talks to the OSDs and wraps the dispatch of object write/read operations
  objecter = new (std::nothrow) Objecter(cct, messenger, &monclient, poolctx);
 

  monclient.set_messenger(messenger);
  mgrclient.set_messenger(messenger);

  objecter->init();
  messenger->add_dispatcher_head(&mgrclient);
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this); // register the message dispatchers: mgrclient, objecter, and the RadosClient itself; there is also the monclient

  messenger->start(); // start the messenger

  // ... the rest is further initialization; see the source for details

  return err;
}
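
From the application side, the initialization above is driven by a handful of C API calls. A minimal sketch (the cluster name, user name and conf path are placeholders):

#include <rados/librados.h>
#include <cstdio>

int main() {
  rados_t cluster;
  // create the cct and RadosClient (rados_create2 above);
  // "ceph" / "client.admin" are placeholder cluster and user names
  int r = rados_create2(&cluster, "ceph", "client.admin", 0);
  if (r < 0) { std::fprintf(stderr, "rados_create2: %d\n", r); return 1; }

  // load monitor addresses, keyring, etc. from a conf file (path is a placeholder)
  rados_conf_read_file(cluster, "/etc/ceph/ceph.conf");

  // RadosClient::connect(): bootstrap MonClient, messenger, objecter, ...
  r = rados_connect(cluster);
  if (r < 0) { std::fprintf(stderr, "rados_connect: %d\n", r); return 1; }

  rados_shutdown(cluster);
  return 0;
}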
1.2 rados creates the I/O wrapper class IoCtx

int librados::RadosClient::create_ioctx(const char *name, IoCtxImpl **io)
{
  int64_t poolid = lookup_pool(name);
  if (poolid < 0) {
    return (int)poolid;
  }

  *io = new librados::IoCtxImpl(this, objecter, poolid, CEPH_NOSNAP);
  return 0;
}

librados::IoCtxImpl::IoCtxImpl(RadosClient *c, Objecter *objecter,
			       int64_t poolid, snapid_t s)
  : client(c), poolid(poolid), snap_seq(s),
    notify_timeout(c->cct->_conf->client_notify_timeout),
    oloc(poolid),
    aio_write_seq(0), objecter(objecter)
{
}

// The IoCtx is bound to a specific pool; the object operations wrapped by this class (create, delete, modify, ...) all act inside that pool.
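
In the C API, create_ioctx() above is reached through rados_ioctx_create. A minimal sketch, assuming the cluster handle from 1.1 and a placeholder pool name:

#include <rados/librados.h>

// assumes 'cluster' is already connected as in 1.1;
// "mypool" is a placeholder pool name
int open_pool(rados_t cluster, rados_ioctx_t *io) {
  // internally: RadosClient::create_ioctx() -> lookup_pool() + new IoCtxImpl
  return rados_ioctx_create(cluster, "mypool", io);
}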
1.3 rados_write

   You may want to read Section 4 first for how the ObjectOperation class is structured and how it wraps ops.

extern "C" int LIBRADOS_C_API_DEFAULT_F(rados_write)(
  rados_ioctx_t io,
  const char *o,
  const char *buf,
  size_t len,
  uint64_t off) // io is the ioctx, o the object name, buf the data to write, len the length, off the offset
{
  tracepoint(librados, rados_write_enter, io, o, buf, len, off);
  if (len > UINT_MAX/2)
    return -E2BIG;
  librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
  object_t oid(o);
  bufferlist bl;
  bl.append(buf, len);
  int retval = ctx->write(oid, bl, len, off);
  tracepoint(librados, rados_write_exit, retval);
  return retval;
}

int librados::IoCtxImpl::write(const object_t& oid, bufferlist& bl,
			       size_t len, uint64_t off)
{
  if (len > UINT_MAX/2)
    return -E2BIG;
  ::ObjectOperation op;  
  prepare_assert_ops(&op);
  bufferlist mybl;
  mybl.substr_of(bl, 0, len);
  op.write(off, mybl);   // ops are wrapped per opcode: write, setxattr, read, etc.
  return operate(oid, &op, NULL);
}
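
A minimal application-side counterpart of this path, assuming the ioctx from 1.2 and a placeholder object name:

#include <rados/librados.h>
#include <cstring>

// assumes 'io' was opened as in 1.2; "myobject" is a placeholder object name
int write_hello(rados_ioctx_t io) {
  const char *data = "hello";
  // wraps an ObjectOperation carrying CEPH_OSD_OP_WRITE and hands it to the objecter
  return rados_write(io, "myobject", data, std::strlen(data), 0 /* offset */);
}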

2 rados calls the objecter

The following is rados's final step: the request is handed to the objecter and sent to the OSDs.

src/librados/IoCtxImpl.cc

int librados::IoCtxImpl::operate(const object_t& oid, ::ObjectOperation *o,
				 ceph::real_time *pmtime, int flags, const jspan_context* otel_trace)
{
  ceph::real_time ut = (pmtime ? *pmtime :
    ceph::real_clock::now());

  /* can't write to a snapshot */
  if (snap_seq != CEPH_NOSNAP)
    return -EROFS;

  if (!o->size())
    return 0;

  ceph::mutex mylock = ceph::make_mutex("IoCtxImpl::operate::mylock");
  ceph::condition_variable cond;
  bool done;
  int r;
  version_t ver;

  Context *oncommit = new C_SafeCond(mylock, cond, &done, &r);

  int op = o->ops[0].op.op;
  ldout(client->cct, 10) << ceph_osd_op_name(op) << " oid=" << oid
			 << " nspace=" << oloc.nspace << dendl;
  Objecter::Op *objecter_op = objecter->prepare_mutate_op(  // ultimately every write is wrapped into an Op and sent to the OSD by the objecter
    oid, oloc,
    *o, snapc, ut,
    flags | extra_op_flags,
    oncommit, &ver, osd_reqid_t(), nullptr, otel_trace);
  objecter->op_submit(objecter_op);

  {
    std::unique_lock l{mylock};
    cond.wait(l, [&done] { return done;});
  }
  ldout(client->cct, 10) << "Objecter returned from "
	<< ceph_osd_op_name(op) << " r=" << r << dendl;

  set_sync_op_version(ver);

  return r;
}
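
Note that operate() blocks on the C_SafeCond until the commit callback fires. Callers that do not want to block can use the aio variants of the C API instead; a minimal sketch, assuming the ioctx from 1.2 and a placeholder object name:

#include <rados/librados.h>
#include <cstring>

// assumes 'io' was opened as in 1.2; "myobject" is a placeholder object name
int write_async(rados_ioctx_t io) {
  rados_completion_t comp;
  int r = rados_aio_create_completion(nullptr, nullptr, nullptr, &comp);
  if (r < 0)
    return r;

  const char *data = "hello";
  // same Objecter path as above, but no condition-variable wait in the caller
  r = rados_aio_write(io, "myobject", comp, data, std::strlen(data), 0);
  if (r == 0) {
    rados_aio_wait_for_complete(comp);   // or poll / use a callback instead of waiting
    r = rados_aio_get_return_value(comp);
  }
  rados_aio_release(comp);
  return r;
}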
3 libcephfs calls the objecter
3.1 libcephfs initialization

 Call ceph_create + ceph_mount; internally the construction is basically the same as the RadosClient initialization.
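
A minimal application-side sketch of that sequence (the client id, conf path and mount root are placeholders):

#include <cephfs/libcephfs.h>
#include <cstdio>

int main() {
  struct ceph_mount_info *cmount;
  // "admin" is a placeholder client id
  int r = ceph_create(&cmount, "admin");
  if (r < 0) { std::fprintf(stderr, "ceph_create: %d\n", r); return 1; }

  // monitor addresses, keyring, etc. (path is a placeholder)
  ceph_conf_read_file(cmount, "/etc/ceph/ceph.conf");

  // fetch the root inode from the MDS and set client->root (see the note below)
  r = ceph_mount(cmount, "/");
  if (r < 0) { std::fprintf(stderr, "ceph_mount: %d\n", r); return 1; }

  ceph_unmount(cmount);
  ceph_release(cmount);
  return 0;
}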

src/libcephfs.cc
src/include/cephfs/libcephfs.h
extern "C" int ceph_create(struct ceph_mount_info **cmount, const char * const id)
{
  CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
  if (id) {
    iparams.name.set(CEPH_ENTITY_TYPE_CLIENT, id);
  }

  CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0);
  cct->_conf.parse_env(cct->get_module_type()); // environment variables override
  cct->_conf.apply_changes(nullptr);
  int ret = ceph_create_with_context(cmount, cct);
  cct->put();
  cct = nullptr;
  return ret;
}
extern "C" int ceph_init(struct ceph_mount_info *cmount)
{
  return cmount->init();
}

extern "C" int ceph_mount(struct ceph_mount_info *cmount, const char *root)
{
  std::string mount_root;
  if (root)
    mount_root = root;
  return cmount->mount(mount_root, cmount->default_perms);
}

// The filesystem case needs a mount root: mount() sends a getattr to the MDS to fetch the root inode and mounts it, setting client->root to the correct ino.
3.2 libcephfs write flow
//src/libcephfs.cc
extern "C" int ceph_ll_write(class ceph_mount_info *cmount,
			     Fh *fh, int64_t off, uint64_t len,
			     const char *data)
{
  return (cmount->get_client()->ll_write(fh, off, len, data));
}
//src/client/Client.cc
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{


  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  std::scoped_lock lock(client_lock);

  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}

int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	                const struct iovec *iov, int iovcnt, Context *onfinish,
	                bool do_fsync, bool syncdataonly)
{

    // the file path involves a lot of metadata updates and handling before the write; skipped here, jumping straight to the key calls


      // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0, iofinish.get(),
				 onfinish == nullptr
				   ? objectcacher->CFG_block_writes_upfront()
				   : false);  // cached path (through the ObjectCacher)

    // ... otherwise, when the cache is not used:
    
    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       iofinish.get()); // uncached path (directly through the Filer)
}
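
From the application side this flow is reached via the POSIX-style C API (or ceph_ll_write with an Fh). A minimal sketch, assuming the mount from 3.1 and a placeholder file path:

#include <cephfs/libcephfs.h>
#include <cstring>
#include <fcntl.h>

// assumes 'cmount' is already mounted as in 3.1; "/myfile" is a placeholder path
int write_file(struct ceph_mount_info *cmount) {
  int fd = ceph_open(cmount, "/myfile", O_WRONLY | O_CREAT, 0644);
  if (fd < 0)
    return fd;

  const char *data = "hello";
  // ends up in Client::_write(): file offset -> object extents -> objecter
  int r = ceph_write(cmount, fd, data, std::strlen(data), 0 /* file offset */);

  ceph_close(cmount, fd);
  return r;
}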
3.3 Calling the objecter

The following is the objecter interface that cephfs ultimately calls to send the request to the OSDs.

src/osdc/Objecter.h

  ceph_tid_t write_trunc(const object_t& oid, const object_locator_t& oloc,
			 uint64_t off, uint64_t len, const SnapContext& snapc,
			 const ceph::buffer::list &bl, ceph::real_time mtime, int flags,
			 uint64_t trunc_size, __u32 trunc_seq,
			 Context *oncommit,
			 version_t *objver = NULL,
			 ObjectOperation *extra_ops = NULL, int op_flags = 0) {
    osdc_opvec ops;
    int i = init_ops(ops, 1, extra_ops);
    ops[i].op.op = CEPH_OSD_OP_WRITE;
    ops[i].op.extent.offset = off;
    ops[i].op.extent.length = len;
    ops[i].op.extent.truncate_size = trunc_size;
    ops[i].op.extent.truncate_seq = trunc_seq;
    ops[i].indata = bl;
    ops[i].op.flags = op_flags;
    Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
		   CEPH_OSD_FLAG_WRITE, oncommit, objver);
    o->mtime = mtime;
    o->snapc = snapc;
    ceph_tid_t tid;
    op_submit(o, &tid);
    return tid;
  }
4 The ObjectOperation class

The ObjectOperation class wraps the handling of the ops vector used in the code above: adding ops, their output slots, and so on.

struct ObjectOperation {
  osdc_opvec ops;
  int flags = 0;
  int priority = 0;

  boost::container::small_vector<ceph::buffer::list*, osdc_opvec_len> out_bl;
  boost::container::small_vector<
    fu2::unique_function<void(boost::system::error_code, int,
			      const ceph::buffer::list& bl) &&>,
    osdc_opvec_len> out_handler;
  boost::container::small_vector<int*, osdc_opvec_len> out_rval;
  boost::container::small_vector<boost::system::error_code*,
				 osdc_opvec_len> out_ec;

  ObjectOperation() = default;
  ObjectOperation(const ObjectOperation&) = delete;
  ObjectOperation& operator =(const ObjectOperation&) = delete;
  ObjectOperation(ObjectOperation&&) = default;
  ObjectOperation& operator =(ObjectOperation&&) = default;
  ~ObjectOperation() = default;

  size_t size() const {
    return ops.size();
  }  

  OSDOp& add_op(int op) {
    ops.emplace_back();
    ops.back().op.op = op;
    out_bl.push_back(nullptr);
    ceph_assert(ops.size() == out_bl.size());
    out_handler.emplace_back();
    ceph_assert(ops.size() == out_handler.size());
    out_rval.push_back(nullptr);
    ceph_assert(ops.size() == out_rval.size());
    out_ec.push_back(nullptr);
    ceph_assert(ops.size() == out_ec.size());
    return ops.back();
  }
  void add_data(int op, uint64_t off, uint64_t len, ceph::buffer::list& bl) {
    OSDOp& osd_op = add_op(op);
    osd_op.op.extent.offset = off;
    osd_op.op.extent.length = len;
    osd_op.indata.claim_append(bl);
  }

  // ... (more member functions follow: read, setxattr, etc.)
};

IoCtxImpl::write() from 1.3 calls op.write(off, mybl); the two-argument overload fills in truncate_size and truncate_seq as 0 and lands in the member shown below:

  void write(uint64_t off, ceph::buffer::list& bl,
	     uint64_t truncate_size,
	     uint32_t truncate_seq) {
    add_data(CEPH_OSD_OP_WRITE, off, bl.length(), bl);
    OSDOp& o = *ops.rbegin();
    o.op.extent.truncate_size = truncate_size;
    o.op.extent.truncate_seq = truncate_seq;
  }
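
For completeness: the batching that ObjectOperation provides internally surfaces in the C++ API as librados::ObjectWriteOperation, which lets an application pack several ops into one request. A minimal sketch, assuming an already-open librados::IoCtx and placeholder names:

#include <rados/librados.hpp>

// assumes 'io' is an IoCtx already opened on some pool;
// "myobject" and "user.tag" are placeholder names
int batched_write(librados::IoCtx& io) {
  librados::bufferlist bl;
  bl.append("hello");

  librados::ObjectWriteOperation op;   // public counterpart of ::ObjectOperation
  op.write(0, bl);                     // CEPH_OSD_OP_WRITE at offset 0
  op.setxattr("user.tag", bl);         // a second op batched into the same request

  return io.operate("myobject", &op);  // one objecter Op carrying both OSD ops
}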
