0814 addendum: detailed call paths of librados and libcephfs
Overall comparison
The librados interface takes object information directly as its input parameters, whereas a libcephfs call must first go through file_to_extents, which translates a file offset/length into the underlying object information. Both rbd and cephfs go through this kind of translation.
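To make the translation concrete, here is a minimal sketch of the striping math that this conversion performs for a single byte offset. SimpleLayout and locate_offset are made-up names for illustration; the fields mirror the stripe_unit/stripe_count/object_size of a Ceph file layout, and the arithmetic follows the usual RAID-0-style striping, not the full extent-list logic of Striper::file_to_extents.

#include <cstdint>
#include <cstdio>

// Hypothetical, simplified stand-in for ceph_file_layout.
struct SimpleLayout {
  uint64_t stripe_unit;   // bytes per stripe unit, e.g. 64K
  uint64_t stripe_count;  // how many objects a stripe spans, e.g. 4
  uint64_t object_size;   // bytes per object, a multiple of stripe_unit
};

// Map a file offset to (object number, offset inside that object).
void locate_offset(const SimpleLayout& l, uint64_t off,
                   uint64_t* objectno, uint64_t* obj_off) {
  uint64_t stripes_per_object = l.object_size / l.stripe_unit;
  uint64_t blockno   = off / l.stripe_unit;       // which stripe unit
  uint64_t stripeno  = blockno / l.stripe_count;  // which stripe
  uint64_t stripepos = blockno % l.stripe_count;  // position inside the stripe
  uint64_t objectsetno = stripeno / stripes_per_object;
  *objectno = objectsetno * l.stripe_count + stripepos;
  *obj_off  = (stripeno % stripes_per_object) * l.stripe_unit
              + off % l.stripe_unit;
}

int main() {
  SimpleLayout l{1u << 16, 4, 1u << 22};        // 64K units, 4-wide, 4M objects
  uint64_t objno, objoff;
  locate_offset(l, 10 << 20, &objno, &objoff);  // the byte at file offset 10M
  printf("object %llu, offset %llu\n",
         (unsigned long long)objno, (unsigned long long)objoff);
  return 0;
}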
1 librados usage flow
1.1 rados initialization flow
rados first calls rados_create2 to create the cct and the RadosClient, then calls rados_connect to connect to the cluster.
// src/include/rados/librados.h
CEPH_RADOS_API int rados_create2(rados_t *pcluster,
                                 const char *const clustername,
                                 const char * const name, uint64_t flags);

// src/librados/librados_c.cc -- creates the cct and the RadosClient
extern "C" int LIBRADOS_C_API_DEFAULT_F(rados_create2)(
  rados_t *pcluster,
  const char *const clustername,
  const char * const name,
  uint64_t flags)
{
  // client is assumed, but from_str will override
  int retval = 0;
  CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
  if (!name || !iparams.name.from_str(name)) {
    retval = -EINVAL;
  }

  CephContext *cct = rados_create_cct(clustername, &iparams); // create the cluster context (cct)
  tracepoint(librados, rados_create2_enter, clustername, name, flags);
  if (retval == 0) {
    *pcluster = reinterpret_cast<rados_t>(new librados::RadosClient(cct)); // allocate a RadosClient, the top-level wrapper class of librados; the actual I/O wrapping lives one level down, in IoCtxImpl
  }
  tracepoint(librados, rados_create2_exit, retval, *pcluster);

  cct->put();
  return retval;
}
// to establish the connection to the cluster, call rados_connect
extern "C" int LIBRADOS_C_API_DEFAULT_F(rados_connect)(rados_t cluster)
{
  tracepoint(librados, rados_connect_enter, cluster);
  librados::RadosClient *client = (librados::RadosClient *)cluster;
  int retval = client->connect();
  tracepoint(librados, rados_connect_exit, retval);
  return retval;
}
LIBRADOS_C_API_BASE_DEFAULT(rados_connect);
rados_connect calls RadosClient::connect(). The function is long, so only the key parts are shown here; its main job is to create the messenger, the objecter, etc. and to finish initializing facilities such as the log thread and the admin socket (asok).
int librados::RadosClient::connect()
{
  int err;

  // create a temporary MonClient to fetch the monmap and the initial cluster configuration
  {
    MonClient mc_bootstrap(cct, poolctx);
    err = mc_bootstrap.get_monmap_and_config();
    if (err < 0)
      return err;
  }

  common_init_finish(cct); // start the log module and the admin socket (asok)

  // create the top-level messaging abstraction, a Messenger (an AsyncMessenger in practice)
  messenger = Messenger::create_client_messenger(cct, "radosclient");

  // create the Objecter, the module that talks to the OSDs and wraps write/read object requests
  objecter = new (std::nothrow) Objecter(cct, messenger, &monclient, poolctx);

  monclient.set_messenger(messenger);
  mgrclient.set_messenger(messenger);
  objecter->init();

  // register the message dispatchers: mgrclient, objecter and the RadosClient itself (plus the monclient)
  messenger->add_dispatcher_head(&mgrclient);
  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  messenger->start(); // start the messaging layer

  // ... the rest is further initialization; see the source for details.
  return err;
}
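A minimal client sketch of this initialization sequence through the public C API; "ceph", "client.admin" and the NULL config path (which means the default ceph.conf locations) are placeholder choices, and error handling is abbreviated:

#include <rados/librados.h>
#include <stdio.h>

int main(void) {
  rados_t cluster;
  // creates the cct and the RadosClient, as traced above
  int ret = rados_create2(&cluster, "ceph", "client.admin", 0);
  if (ret < 0) { fprintf(stderr, "rados_create2: %d\n", ret); return 1; }

  // load monitor addresses, keyring path, etc. from ceph.conf
  rados_conf_read_file(cluster, NULL);

  // RadosClient::connect(): bootstrap MonClient, messenger, objecter, ...
  ret = rados_connect(cluster);
  if (ret < 0) { fprintf(stderr, "rados_connect: %d\n", ret); return 1; }

  rados_shutdown(cluster);
  return 0;
}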
1.2 rados creates the I/O wrapper class IoCtxImpl
int librados::RadosClient::create_ioctx(const char *name, IoCtxImpl **io)
{
  int64_t poolid = lookup_pool(name);
  if (poolid < 0) {
    return (int)poolid;
  }
  *io = new librados::IoCtxImpl(this, objecter, poolid, CEPH_NOSNAP);
  return 0;
}

librados::IoCtxImpl::IoCtxImpl(RadosClient *c, Objecter *objecter,
                               int64_t poolid, snapid_t s)
  : client(c), poolid(poolid), snap_seq(s),
    notify_timeout(c->cct->_conf->client_notify_timeout),
    oloc(poolid),
    aio_write_seq(0), objecter(objecter)
{
}
An IoCtx is bound to a specific pool; the functions it wraps operate on objects within that pool, covering object creation, deletion, and modification.
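A minimal sketch of obtaining such an ioctx through the C API, assuming `cluster` was created and connected as in 1.1; "mypool" is a placeholder pool name:

#include <rados/librados.h>

static int open_pool(rados_t cluster, rados_ioctx_t *io) {
  // binds the ioctx to one pool; internally this is
  // RadosClient::create_ioctx(), which looks up the pool id
  // and allocates an IoCtxImpl
  int ret = rados_ioctx_create(cluster, "mypool", io);
  if (ret < 0)
    return ret; // e.g. -ENOENT if the pool does not exist
  return 0;
}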
1.3 rados_write
You may want to read section 4 first, on how the ObjectOperation class is structured and how it wraps ops.
extern "C" int LIBRADOS_C_API_DEFAULT_F(rados_write)(
rados_ioctx_t io,
const char *o,
const char *buf,
size_t len,
uint64_t off) //io就是ioctx,o是对象名,buf是写入到数据,len是长度,off是偏移
{
tracepoint(librados, rados_write_enter, io, o, buf, len, off);
if (len > UINT_MAX/2)
return -E2BIG;
librados::IoCtxImpl *ctx = (librados::IoCtxImpl *)io;
object_t oid(o);
bufferlist bl;
bl.append(buf, len);
int retval = ctx->write(oid, bl, len, off);
tracepoint(librados, rados_write_exit, retval);
return retval;
}
int librados::IoCtxImpl::write(const object_t& oid, bufferlist& bl,
                               size_t len, uint64_t off)
{
  if (len > UINT_MAX/2)
    return -E2BIG;
  ::ObjectOperation op;
  prepare_assert_ops(&op);
  bufferlist mybl;
  mybl.substr_of(bl, 0, len);
  op.write(off, mybl); // each opcode has its own wrapper: write, setxattr, read, etc.
  return operate(oid, &op, NULL);
}
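Putting the pieces together, a minimal end-to-end sketch of this write path through the C API; `io` comes from rados_ioctx_create() as in 1.2, and "myobject" is a placeholder name:

#include <rados/librados.h>
#include <string.h>

// write an object, then read it back
static int write_and_check(rados_ioctx_t io) {
  const char *payload = "hello ceph";
  // path: rados_write() -> IoCtxImpl::write() -> ObjectOperation -> Objecter
  int ret = rados_write(io, "myobject", payload, strlen(payload), 0);
  if (ret < 0)
    return ret; // 0 on success in current releases

  char buf[16] = {0};
  ret = rados_read(io, "myobject", buf, sizeof(buf) - 1, 0);
  return ret < 0 ? ret : 0; // rados_read returns the number of bytes read
}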
2 rados calls into the Objecter
The final step on the rados side is to hand the operation to the Objecter, which sends it to the OSD:
src/librados/IoCtxImpl.cc
int librados::IoCtxImpl::operate(const object_t& oid, ::ObjectOperation *o,
                                 ceph::real_time *pmtime, int flags,
                                 const jspan_context* otel_trace)
{
  ceph::real_time ut = (pmtime ? *pmtime :
                        ceph::real_clock::now());

  /* can't write to a snapshot */
  if (snap_seq != CEPH_NOSNAP)
    return -EROFS;

  if (!o->size())
    return 0;

  ceph::mutex mylock = ceph::make_mutex("IoCtxImpl::operate::mylock");
  ceph::condition_variable cond;
  bool done;
  int r;
  version_t ver;

  Context *oncommit = new C_SafeCond(mylock, cond, &done, &r);

  int op = o->ops[0].op.op;
  ldout(client->cct, 10) << ceph_osd_op_name(op) << " oid=" << oid
                         << " nspace=" << oloc.nspace << dendl;
  // every write is ultimately wrapped into an Op and sent by the Objecter to the OSD
  Objecter::Op *objecter_op = objecter->prepare_mutate_op(
    oid, oloc,
    *o, snapc, ut,
    flags | extra_op_flags,
    oncommit, &ver, osd_reqid_t(), nullptr, otel_trace);
  objecter->op_submit(objecter_op);

  {
    std::unique_lock l{mylock};
    cond.wait(l, [&done] { return done;});
  }
  ldout(client->cct, 10) << "Objecter returned from "
                         << ceph_osd_op_name(op) << " r=" << r << dendl;

  set_sync_op_version(ver);

  return r;
}
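Note that operate() turns the asynchronous op_submit() into a synchronous call by waiting on a C_SafeCond. The C API also exposes the asynchronous path directly; a minimal sketch ("myobject" and the payload are placeholders):

#include <rados/librados.h>
#include <string.h>

// asynchronous counterpart of rados_write: submit, then wait explicitly,
// mirroring what IoCtxImpl::operate() does internally with C_SafeCond
static int aio_write_example(rados_ioctx_t io) {
  rados_completion_t comp;
  int ret = rados_aio_create_completion(NULL, NULL, NULL, &comp);
  if (ret < 0)
    return ret;

  const char *payload = "hello aio";
  ret = rados_aio_write(io, "myobject", comp, payload, strlen(payload), 0);
  if (ret == 0) {
    rados_aio_wait_for_complete(comp);      // block until the OSD acks
    ret = rados_aio_get_return_value(comp); // the op's result code
  }
  rados_aio_release(comp);
  return ret;
}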
3 libcephfs calls into the Objecter
3.1 libcephfs initialization
Call ceph_create followed by ceph_mount; internally the construction is essentially the same as RadosClient's initialization.
src/libcephfs.cc
src/include/libcephfs.h
extern "C" int ceph_create(struct ceph_mount_info **cmount, const char * const id)
{
CephInitParameters iparams(CEPH_ENTITY_TYPE_CLIENT);
if (id) {
iparams.name.set(CEPH_ENTITY_TYPE_CLIENT, id);
}
CephContext *cct = common_preinit(iparams, CODE_ENVIRONMENT_LIBRARY, 0);
cct->_conf.parse_env(cct->get_module_type()); // environment variables coverride
cct->_conf.apply_changes(nullptr);
int ret = ceph_create_with_context(cmount, cct);
cct->put();
cct = nullptr;
return ret;
}
extern "C" int ceph_init(struct ceph_mount_info *cmount)
{
return cmount->init();
}
extern "C" int ceph_mount(struct ceph_mount_info *cmount, const char *root)
{
std::string mount_root;
if (root)
mount_root = root;
return cmount->mount(mount_root, cmount->default_perms);
}
//这里因为文件场景需要设置挂载目录,mount就是给mds发getattr消息拿到root信息并挂载,设置client->root为正确的ino
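A minimal mount sequence through the public API; the NULL config path (default ceph.conf locations) and the "/" mount root are placeholder choices:

#include <cephfs/libcephfs.h>
#include <stdio.h>

int main(void) {
  struct ceph_mount_info *cmount;
  int ret = ceph_create(&cmount, NULL);  // NULL id -> default client name
  if (ret < 0) return 1;

  ceph_conf_read_file(cmount, NULL);     // monitor addresses, keyring, ...
  ret = ceph_mount(cmount, "/");         // getattr to the MDS, sets client->root
  if (ret < 0) { fprintf(stderr, "mount: %d\n", ret); return 1; }

  /* ... file I/O ... */

  ceph_unmount(cmount);
  ceph_release(cmount);
  return 0;
}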
3.2 libcephfs write flow
//src/libcephfs.cc
extern "C" int ceph_ll_write(class ceph_mount_info *cmount,
Fh *fh, int64_t off, uint64_t len,
const char *data)
{
return (cmount->get_client()->ll_write(fh, off, len, data));
}
//src/client/Client.cc
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
/* We can't return bytes written larger than INT_MAX, clamp len to that */
len = std::min(len, (loff_t)INT_MAX);
std::scoped_lock lock(client_lock);
int r = _write(fh, off, len, data, NULL, 0);
ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
<< dendl;
return r;
}
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
const struct iovec *iov, int iovcnt, Context *onfinish,
bool do_fsync, bool syncdataonly)
{
//文件场景在写之前牵扯大量的元数据信息更新和处理,这里略过,直接进入关键函数
// async, caching, non-blocking.
r = objectcacher->file_write(&in->oset, &in->layout,
in->snaprealm->get_snap_context(),
offset, size, bl, ceph::real_clock::now(),
0, iofinish.get(),
onfinish == nullptr
? objectcacher->CFG_block_writes_upfront()
: false); //走缓存
//**********
filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
offset, size, bl, ceph::real_clock::now(), 0,
in->truncate_size, in->truncate_seq,
iofinish.get()); //不走缓存
}
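For comparison with ll_write, the simpler fd-based API ends up in the same Client::_write path. A minimal sketch, assuming `cmount` is already mounted as in 3.1; "/myfile" is a placeholder path:

#include <cephfs/libcephfs.h>
#include <fcntl.h>
#include <string.h>

static int write_file(struct ceph_mount_info *cmount) {
  int fd = ceph_open(cmount, "/myfile", O_CREAT | O_WRONLY, 0644);
  if (fd < 0)
    return fd;
  const char *payload = "hello cephfs";
  // like ceph_ll_write(), this lands in Client::_write(), which then goes
  // through the objectcacher (cached) or filer->write_trunc() (uncached)
  int ret = ceph_write(cmount, fd, payload, strlen(payload), 0);
  ceph_close(cmount, fd);
  return ret; // bytes written, or a negative errno
}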
3.3 Calling the Objecter
On the cephfs side, the final dispatch to the OSD goes through the following Objecter interface:
src/osdc/Objecter.h
ceph_tid_t write_trunc(const object_t& oid, const object_locator_t& oloc,
                       uint64_t off, uint64_t len, const SnapContext& snapc,
                       const ceph::buffer::list &bl, ceph::real_time mtime, int flags,
                       uint64_t trunc_size, __u32 trunc_seq,
                       Context *oncommit,
                       version_t *objver = NULL,
                       ObjectOperation *extra_ops = NULL, int op_flags = 0) {
  osdc_opvec ops;
  int i = init_ops(ops, 1, extra_ops);
  ops[i].op.op = CEPH_OSD_OP_WRITE;
  ops[i].op.extent.offset = off;
  ops[i].op.extent.length = len;
  ops[i].op.extent.truncate_size = trunc_size;
  ops[i].op.extent.truncate_seq = trunc_seq;
  ops[i].indata = bl;
  ops[i].op.flags = op_flags;
  Op *o = new Op(oid, oloc, std::move(ops), flags | global_op_flags |
                 CEPH_OSD_FLAG_WRITE, oncommit, objver);
  o->mtime = mtime;
  o->snapc = snapc;
  ceph_tid_t tid;
  op_submit(o, &tid);
  return tid;
}
4 The ObjectOperation class
The ObjectOperation class wraps the logic for adding ops to (and managing) the ops vector used in the code above.
struct ObjectOperation {
  osdc_opvec ops;
  int flags = 0;
  int priority = 0;

  boost::container::small_vector<ceph::buffer::list*, osdc_opvec_len> out_bl;
  boost::container::small_vector<
    fu2::unique_function<void(boost::system::error_code, int,
                              const ceph::buffer::list& bl) &&>,
    osdc_opvec_len> out_handler;
  boost::container::small_vector<int*, osdc_opvec_len> out_rval;
  boost::container::small_vector<boost::system::error_code*,
                                 osdc_opvec_len> out_ec;

  ObjectOperation() = default;
  ObjectOperation(const ObjectOperation&) = delete;
  ObjectOperation& operator =(const ObjectOperation&) = delete;
  ObjectOperation(ObjectOperation&&) = default;
  ObjectOperation& operator =(ObjectOperation&&) = default;
  ~ObjectOperation() = default;

  size_t size() const {
    return ops.size();
  }

  OSDOp& add_op(int op) {
    ops.emplace_back();
    ops.back().op.op = op;
    out_bl.push_back(nullptr);
    ceph_assert(ops.size() == out_bl.size());
    out_handler.emplace_back();
    ceph_assert(ops.size() == out_handler.size());
    out_rval.push_back(nullptr);
    ceph_assert(ops.size() == out_rval.size());
    out_ec.push_back(nullptr);
    ceph_assert(ops.size() == out_ec.size());
    return ops.back();
  }

  void add_data(int op, uint64_t off, uint64_t len, ceph::buffer::list& bl) {
    OSDOp& osd_op = add_op(op);
    osd_op.op.extent.offset = off;
    osd_op.op.extent.length = len;
    osd_op.indata.claim_append(bl);
  }
  void write(uint64_t off, ceph::buffer::list& bl,
             uint64_t truncate_size,
             uint32_t truncate_seq) {
    add_data(CEPH_OSD_OP_WRITE, off, bl.length(), bl);
    OSDOp& o = *ops.rbegin();
    o.op.extent.truncate_size = truncate_size;
    o.op.extent.truncate_seq = truncate_seq;
  }
  // two-argument overload, which is what IoCtxImpl::write() above calls
  void write(uint64_t off, ceph::buffer::list& bl) {
    write(off, bl, 0, 0);
  }
};
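The public C++ API exposes this compound-op machinery as librados::ObjectWriteOperation, so several ops can be batched into one OSD round trip and applied atomically. A minimal sketch; the object name, xattr, and payload are placeholders, and `io_ctx` is assumed bound to a pool as in 1.2:

#include <rados/librados.hpp>

// batch create + setxattr + write into a single compound op
int compound_write(librados::IoCtx& io_ctx) {
  librados::ObjectWriteOperation op;    // wraps an ::ObjectOperation
  op.create(true);                      // fail with -EEXIST if it already exists

  librados::bufferlist xattr, data;
  xattr.append("v1");
  data.append("hello compound op");
  op.setxattr("version", xattr);        // each call adds one OSDOp via add_op()
  op.write(0, data);                    // CEPH_OSD_OP_WRITE via add_data()

  return io_ctx.operate("myobject", &op);  // one Objecter::op_submit()
}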