调用栈
void PrimaryLogPG::execute_ctx(OpContext *ctx)
|--ctx->register_on_commit
|--ctx->register_on_success
|--ctx->register_on_finish
|--RepGather *repop = new_repop(ctx, obc, rep_tid);//增加到一个repop_queue
|--issue_repop(repop, ctx);
|--ReplicatedBackend::submit_transaction
|--ReplicatedBackend::issue_op//发副本消息是在这里面做的
|--eval_repop(repop);
register_on_commit注册了回客户端消息的函数
// Body of the commit callback: answer the client exactly once per op context.
if (m && !ctx->sent_reply) {
  MOSDOpReply *reply = ctx->reply;
  if (!reply) {
    // No reply was prepared on the context: build one carrying the op versions.
    reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, true);
    reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
  } else {
    // Take over the reply stored on the context and clear the context's pointer.
    ctx->reply = nullptr;
  }

  reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
  dout(10) << " sending reply on " << *m << " " << reply << dendl;
  osd->send_message_osd_client(reply, m->get_connection());

  // Remember that the reply went out so this branch is skipped next time.
  ctx->sent_reply = true;
  ctx->op->mark_commit_sent();
}
RepGather *repop = new_repop(ctx, obc, rep_tid),上面注册的三个回调函数都传入到了这个类的实例。
ReplicatedBackend::submit_transaction
auto insert_res = in_progress_ops.insert(make_pair(tid, new InProgressOp(tid, on_all_commit, orig_op , at_version)));
/* 如果是副本,插入副本个数的item */
op.waiting_for_commit.insert
parent->log_operation
/* 本地存储引擎commit以后的回调 */
op_t.register_on_commit(parent->bless_context(new C_OSD_OnOpCommit(this, &op)));
从waiting_for_commit的变化就可以看出回调的流程,那waiting_for_commit是如何变化的呢。
对于副本pool的写来说,从submit_transaction这里可以看到有插入副本个数的item到waiting_for_commit, 而erase的地方有两处:
- erase第一点是在ReplicatedBackend::do_repop_reply函数中。这是主osd收到副本osd回复消息(MSG_OSD_REPOPREPLY)时的处理函数,也就是从osd写完副本后,主osd收到从osd的响应;只有回复带有CEPH_OSD_FLAG_ONDISK标志时才会erase。
/**
 * Handle a replication reply (MSG_OSD_REPOPREPLY) received from a replica shard.
 *
 * Looks up the in-progress op by the reply's tid. When the reply carries
 * CEPH_OSD_FLAG_ONDISK, the sending shard is removed from the op's
 * waiting_for_commit set. Once that set is empty and the op still has an
 * on_commit completion, the completion is fired and the op is dropped from
 * in_progress_ops.
 *
 * @param op the request wrapping the MOSDRepOpReply message
 */
void ReplicatedBackend::do_repop_reply(OpRequestRef op)
{
  static_cast<MOSDRepOpReply *>(op->get_nonconst_req())->finish_decode();
  const MOSDRepOpReply *r = static_cast<const MOSDRepOpReply *>(op->get_req());
  ceph_assert(r->get_header().type == MSG_OSD_REPOPREPLY);

  op->mark_started();

  // must be replication.
  ceph_tid_t rep_tid = r->get_tid();
  pg_shard_t from = r->from;

  auto iter = in_progress_ops.find(rep_tid);
  if (iter != in_progress_ops.end()) {
    InProgressOp &ip_op = *iter->second;
    const MOSDOp *m = NULL;
    if (ip_op.op)
      m = static_cast<const MOSDOp *>(ip_op.op->get_req());

    // ... (part of the function body is elided in these notes)

    // oh, good.
    if (r->ack_type & CEPH_OSD_FLAG_ONDISK) {
      ceph_assert(ip_op.waiting_for_commit.count(from));
      /* erase call site #1: the replica reports its write as on disk */
      ip_op.waiting_for_commit.erase(from);
      if (ip_op.op) {
        ip_op.op->mark_event("sub_op_commit_rec");
        ip_op.op->pg_trace.event("sub_op_commit_rec");
      }
    } else {
      // legacy peer; ignore
    }

    parent->update_peer_last_complete_ondisk(
      from,
      r->get_last_complete_ondisk());

    /* Nobody left to wait for: fire the commit completion and forget the op. */
    if (ip_op.waiting_for_commit.empty() && ip_op.on_commit) {
      ip_op.on_commit->complete(0);
      ip_op.on_commit = 0;
      in_progress_ops.erase(iter);
    }
  }
}
- erase第二点是在ReplicatedBackend::op_commit函数中,该函数由C_OSD_OnOpCommit::finish调用,即本地写存储引擎返回之后。
/**
 * Completion path for the local commit of an in-progress op.
 *
 * Removes this shard (get_parent()->whoami_shard()) from the op's
 * waiting_for_commit set. If nothing is left in the set afterwards, the
 * op's on_commit completion is fired and the op is removed from
 * in_progress_ops.
 *
 * @param op the in-progress op whose local commit has finished
 */
void ReplicatedBackend::op_commit(
  InProgressOpRef &op)
{
  // A cleared on_commit means the op was aborted; there is nothing to complete.
  if (!op->on_commit)
    return;

  FUNCTRACE(cct);
  OID_EVENT_TRACE_WITH_MSG((op && op->op) ? op->op->get_req() : NULL, "OP_COMMIT_BEGIN", true);
  dout(10) << __func__ << ": " << op->tid << dendl;

  if (op->op) {
    op->op->mark_event("op_commit");
    op->op->pg_trace.event("op commit");
  }

  // The local shard no longer has to be waited for.
  op->waiting_for_commit.erase(get_parent()->whoami_shard());

  // Entries still pending: the completion fires once the set is empty.
  if (!op->waiting_for_commit.empty())
    return;

  // Every shard that was waited for has committed, this one included.
  op->on_commit->complete(0);
  op->on_commit = 0;
  in_progress_ops.erase(op->tid);
}
到这里我们可以看出,副本osd和本地osd谁先处理完是不确定的:do_repop_reply和op_commit这两条路径,哪一条最后把waiting_for_commit清空,就由哪一条触发回调。
而且从上面两个函数看出waiting_for_commit为空的时候会进行回调,回调如下:
class C_OSD_RepopCommit : public Context
{
PrimaryLogPGRef pg;
boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
public:
C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
: pg(pg), repop(repop) {}
void finish(int) override
{
pg->repop_all_committed(repop.get());
}
};
C_OSD_RepopCommit是PrimaryLogPG::issue_repop的时候注册的,在finish中回调以下函数:
// 调用
/**
 * Mark a repop as committed everywhere and, unless it was aborted,
 * advance the PG's on-disk markers and evaluate the repop.
 *
 * @param repop the repop whose commits have all arrived
 */
void PrimaryLogPG::repop_all_committed(RepGather *repop)
{
  dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
           << dendl;

  // eval_repop() keys off this flag.
  repop->all_committed = true;

  // An aborted repop only gets flagged; nothing else is updated or evaluated.
  if (repop->rep_aborted)
    return;

  // Only a repop with a non-default version moves the on-disk markers.
  if (repop->v != eversion_t()) {
    last_update_ondisk = repop->v;
    last_complete_ondisk = repop->pg_local_last_complete;
  }

  eval_repop(repop);
}
上面代码块调用了void PrimaryLogPG::eval_repop(RepGather *repop)
/*
 * Evaluate a repop after its state changed.
 *
 * Nothing below runs unless repop->all_committed is set. When it is:
 *   1. every callback in repop->on_committed is invoked and erased;
 *   2. ops parked in waiting_for_ondisk under repop->v are answered via
 *      osd->reply_op_error() and the entry is dropped;
 *   3. stats are published and calc_min_last_complete_ondisk() is called;
 *   4. if this repop is at the front of repop_queue, every leading repop that
 *      is already all_committed is popped, has its on_success callbacks run,
 *      and is handed to remove_repop().
 *
 * The "①" marker and the text between the two ">>>" marker lines are
 * annotations made by these notes, not part of the function itself.
 */
void PrimaryLogPG::eval_repop(RepGather *repop)
{
const MOSDOp *m = NULL;
if (repop->op)
m = static_cast<const MOSDOp *>(repop->op->get_req());
// ondisk?
/* PrimaryLogPG::repop_all_committed sets this flag to true before calling here */
if (repop->all_committed)
{
dout(10) << " commit: " << *repop << dendl;
/* Run each commit callback, erasing it right after it has been invoked. */
for (auto p = repop->on_committed.begin();
p != repop->on_committed.end();
repop->on_committed.erase(p++))
{
/* These are the callbacks registered at the top level,
 * i.e. through ctx->register_on_commit.
 */
(*p)();
}
// send dup commits, in order
auto it = waiting_for_ondisk.find(repop->v);
if (it != waiting_for_ondisk.end())
{
/* The entry for this version is expected to be the first one in the map. */
ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
for (auto &i : it->second)
{
/* A negative repop->r is passed on as is; otherwise the code stored
 * with the waiting op (tuple element 2) is used.
 */
int return_code = repop->r;
if (return_code >= 0)
{
return_code = std::get<2>(i);
}
osd->reply_op_error(std::get<0>(i), return_code, repop->v,
std::get<1>(i));
}
waiting_for_ondisk.erase(it);
}
publish_stats_to_osd();
calc_min_last_complete_ondisk();
dout(10) << " removing " << *repop << dendl;
ceph_assert(!repop_queue.empty());
dout(20) << " q front is " << *repop_queue.front() << dendl;
/* Marker ① (referenced by the surrounding notes): removal only starts when
 * this repop is at the head of repop_queue.
 */
①if (repop_queue.front() == repop)
{
RepGather *to_remove = nullptr;
/* Drain the queue head for as long as the front repop is all_committed. */
while (!repop_queue.empty() &&
(to_remove = repop_queue.front())->all_committed)
{
repop_queue.pop_front();
for (auto p = to_remove->on_success.begin();
p != to_remove->on_success.end();
to_remove->on_success.erase(p++))
{
/* invoke the on_success callback */
(*p)();
}
remove_repop(to_remove);
/* Annotation: the lines between the two ">>>" markers are an excerpt of the
 * remove_repop() implementation, pasted here by the notes for illustration.
 */
remove_repop实现>>>>>>>
for (auto p = repop->on_finish.begin(); p != repop->on_finish.end();
repop->on_finish.erase(p++))
{
/* invoke the on_finish callback */
(*p)();
}
/* release the object read/write locks */
release_object_locks(repop->lock_manager);
>>>>>>>>>
}
}
}
}
该函数最终回调了PrimaryLogPG::execute_ctx注册的三个函数。
- oncommit: 回客户端消息
- onsuccess: watch相关的操作
- onfinish: 释放上下文内存
从代码①处的逻辑可以看出,repop_queue起到了保序的作用:on_commit回调(回客户端消息)在IO的all_committed置位后立即执行,不受队列位置影响;而on_success回调以及remove_repop(其中执行on_finish并释放锁)则按照先入先出的原则进行,即如果一个IO不在队列头,就暂不执行这部分回调,等队列头的IO完成时,会驱动其后已经all_committed的IO依次完成这部分回调。
最终在release_object_locks中释放了读写锁。