整个函数的要点如下:
1、 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false); //获取该directory的目录项
2、 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode); //为该目录创建新的CInode节点newi
3、 CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); //获取或创建一个CDir项newdir
4、将这些改变记入日志MDLog中
首先看下handle_client_mkdir函数内容
// MKDIR
/*
 * Handle a client mkdir request.
 *
 * Obtains the target dentry and the required locks, prepares a new directory
 * inode plus its root dirfrag (marked complete), links the inode to the dentry
 * through projected linkage, records the change in an EUpdate("mkdir") journal
 * event, issues a capability on the new inode to the requesting session, and
 * finally hands everything to journal_and_reply() with a C_MDS_mknod_finish
 * completion.
 *
 * This function takes responsibility for the passed mdr
 */
void Server::handle_client_mkdir(MDRequestRef& mdr)
{
MClientRequest *req = mdr->client_request;
set<SimpleLock*> rdlocks, wrlocks, xlocks;
CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false); // dentry under which the new directory will be linked
// no dentry returned: nothing more to do here
// NOTE(review): presumably the helper already replied or queued a retry -- confirm
if (!dn) return;
// a request whose snapid is not CEPH_NOSNAP is rejected as read-only
if (mdr->snapid != CEPH_NOSNAP) {
respond_to_request(mdr, -EROFS);
return;
}
// also rdlock the authlock of the parent directory's inode
CInode *diri = dn->get_dir()->get_inode();
rdlocks.insert(&diri->authlock);
// NOTE(review): on failure we simply return; presumably the request is retried later -- confirm in Locker::acquire_locks
if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
return;
// new inode
// "follows" is the newest snapshot sequence of the parent directory's snap realm
SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
snapid_t follows = realm->get_newest_seq();
// take the mode from the request but force the file-type bits to S_IFDIR
unsigned mode = req->head.args.mkdir.mode;
mode &= ~S_IFMT;
mode |= S_IFDIR;
CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode); // prepare the new directory inode, passing the ino carried in the request head
assert(newi);
// it's a directory.
// link newi to dn as projected linkage and give the inode the dentry's pre-dirty version
dn->push_projected_linkage(newi);
newi->inode.version = dn->pre_dirty();
// NOTE(review): rsubdirs starts at 1, presumably counting the directory itself -- confirm
newi->inode.rstat.rsubdirs = 1;
newi->inode.update_backtrace();
dout(12) << " follows " << follows << dendl;
// make sure dn->first is past the newest snapshot seq, then start newi at the same snapid
if (follows >= dn->first)
dn->first = follows + 1;
newi->first = dn->first;
// ...and that new dir is empty.
CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); // get or open the dirfrag frag_t() of the new inode
newdir->mark_complete();
newdir->fnode.version = newdir->pre_dirty();
// prepare finisher
// everything below records the changes made above in the MDLog journal
mdr->ls = mdlog->get_current_segment();
EUpdate *le = new EUpdate(mdlog, "mkdir"); // journal event for this operation, tagged "mkdir"
mdlog->start_entry(le); // open the entry in the mdlog (predirty_journal_parents asserts entry_is_open())
le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid()); // record the client request id and oldest client tid in the metablob
journal_allocated_inos(mdr, &le->metablob); // NOTE(review): presumably journals the ino(s) allocated for this request -- confirm
mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1); // project/pre-dirty the parent chain; linkunlink=1 because one entry is being linked
le->metablob.add_primary_dentry(dn, newi, true, true); // add dn together with newi as a primary dentry
le->metablob.add_new_dir(newdir); // dirty AND complete AND new
// issue a cap on the directory
int cmode = CEPH_FILE_MODE_RDWR; // file mode passed to issue_new_caps
Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
if (cap) {
cap->set_wanted(0);
// put locks in excl mode
newi->filelock.set_state(LOCK_EXCL);
newi->authlock.set_state(LOCK_EXCL);
newi->xattrlock.set_state(LOCK_EXCL);
// issue exclusive+shared AUTH and XATTR caps via issue_norevoke
cap->issue_norevoke(CEPH_CAP_AUTH_EXCL|CEPH_CAP_AUTH_SHARED|
CEPH_CAP_XATTR_EXCL|CEPH_CAP_XATTR_SHARED);
}
// make sure this inode gets into the journal
le->metablob.add_opened_ino(newi->ino()); // record newi's ino as an opened ino in the metablob
LogSegment *ls = mds->mdlog->get_current_segment(); // current log segment
ls->open_files.push_back(&newi->item_open_file); // append newi to the segment's open_files list
journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows)); // submit the event and reply; the finisher receives dn, newi and follows
}
注意上面代码中的EMetaBlob类(即le->metablob)内部定义了相关的类和结构体,用来存储要写入日志的元数据。
函数的流程图如下:
以下为 predirty_journal_parents 函数
/*
* NOTE: we _have_ to delay the scatter if we are called during a
* rejoin, because we can't twiddle locks between when the
* rejoin_(weak|strong) is received and when we send the rejoin_ack.
* normally, this isn't a problem: a recover mds doesn't twiddle locks
* (no requests), and a survivor acks immediately. _except_ that
* during rejoin_(weak|strong) processing, we may complete a lock
* gather, and do a scatter_writebehind.. and we _can't_ twiddle the
* scatterlock state in that case or the lock states will get out of
* sync between the auth and replica.
*
* the simple solution is to never do the scatter here. instead, put
* the scatterlock on a list if it isn't already wrlockable. this is
* probably the best plan anyway, since we avoid too many
* scatters/locks under normal usage.
*/
/*
* some notes on dirlock/nestlock scatterlock semantics:
*
* the fragstat (dirlock) will never be updated without
* dirlock+nestlock wrlock held by the caller.
*
* the rstat (nestlock) _may_ get updated without a wrlock when nested
* data is pushed up the tree. this could be changed with some
* restructuring here, but in its current form we ensure that the
* fragstat+rstat _always_ reflect an accurate summation over the dir
* frag, which is nice. and, we only need to track frags that need to
* be nudged (and not inodes with pending rstat changes that need to
* be pushed into the frag). a consequence of this is that the
* accounted_rstat on scatterlock sync may not match our current
* rstat. this is normal and expected.
*/
/*
 * Project and pre-dirty the chain of parent dirfrags/inodes above `in`,
 * pushing fragstat/rstat changes upward, and add the result to the journal
 * blob.
 *
 * mut         mutation that collects auth pins, projected fnodes/inodes and locks
 * blob        journal metablob receiving the dir context and the dirtied inodes
 * in          inode whose change is being propagated; nothing is done if it is a base inode
 * parent      dirfrag to start from; if null, the dir of in's projected parent
 *             dentry is used (requires PREDIRTY_PRIMARY)
 * flags       PREDIRTY_PRIMARY / PREDIRTY_DIR / PREDIRTY_SHALLOW bits
 *             (PREDIRTY_SHALLOW is only logged in this function)
 * linkunlink  signed delta added to the first parent's nsubdirs or nfiles
 * cfollows    snapid to follow; CEPH_NOSNAP means use the parent realm's newest seq
 *
 * The mdlog entry must already be open (asserted below).
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
CInode *in, CDir *parent,
int flags, int linkunlink,
snapid_t cfollows)
{
bool primary_dn = flags & PREDIRTY_PRIMARY; // decode the PREDIRTY_* flag bits
bool do_parent_mtime = flags & PREDIRTY_DIR;
bool shallow = flags & PREDIRTY_SHALLOW;
assert(mds->mdlog->entry_is_open());
// make sure stamp is set
if (mut->get_mds_stamp() == utime_t())
mut->set_mds_stamp(ceph_clock_now(g_ceph_context));
// a base inode has no parent to propagate into
if (in->is_base())
return;
dout(10) << "predirty_journal_parents"
<< (do_parent_mtime ? " do_parent_mtime":"")
<< " linkunlink=" << linkunlink
<< (primary_dn ? " primary_dn":" remote_dn")
<< (shallow ? " SHALLOW":"")
<< " follows " << cfollows
<< " " << *in << dendl;
// no explicit parent given: fall back to the dir of the projected parent dentry
if (!parent) {
assert(primary_dn);
parent = in->get_projected_parent_dn()->get_dir();
}
if (flags == 0 && linkunlink == 0) {
dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
blob->add_dir_context(parent);
return;
}
// build list of inodes to wrlock, dirty, and update
list<CInode*> lsi; // projected ancestor inodes, journaled after the loop (push_front => topmost first)
CInode *cur = in;
CDentry *parentdn = NULL;
bool first = true;
while (parent) { // walk upward, one dirfrag (parent) / directory inode (pin) pair per iteration; exits via break
//assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
assert(parent->is_auth());
// opportunistically adjust parent dirfrag
CInode *pin = parent->get_inode();
// inode -> dirfrag
mut->auth_pin(parent);
mut->add_projected_fnode(parent);
fnode_t *pf = parent->project_fnode();
pf->version = parent->pre_dirty();
// fragstat changes (mtime / entry counts) require the caller to hold a wrlock on pin's filelock
if (do_parent_mtime || linkunlink) {
assert(mut->wrlocks.count(&pin->filelock));
assert(cfollows == CEPH_NOSNAP);
// update stale fragstat?
parent->resync_accounted_fragstat();
if (do_parent_mtime) {
pf->fragstat.mtime = mut->get_op_stamp();
// rctime only ever moves forward
if (pf->fragstat.mtime > pf->rstat.rctime) {
dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
pf->rstat.rctime = pf->fragstat.mtime;
} else {
dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
}
}
if (linkunlink) {
dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
if (in->is_dir()) {
pf->fragstat.nsubdirs += linkunlink;
//pf->rstat.rsubdirs += linkunlink;
} else {
pf->fragstat.nfiles += linkunlink;
//pf->rstat.rfiles += linkunlink;
}
}
}
// rstat
if (!primary_dn) {
// don't update parent this pass
} else if (!linkunlink && !(parent->inode->nestlock.can_wrlock(-1) &&
parent->inode->versionlock.can_wrlock())) {
// cannot write the parent's rstat now: leave cur flagged as having dirty rstat
dout(20) << " unwritable parent nestlock " << parent->inode->nestlock
<< ", marking dirty rstat on " << *cur << dendl;
cur->mark_dirty_rstat();
} else {
// if we don't hold a wrlock reference on this nestlock, take one,
// because we are about to write into the dirfrag fnode and that needs
// to commit before the lock can cycle.
if (linkunlink) {
assert(parent->inode->nestlock.get_num_wrlocks() || mut->is_slave());
}
if (mut->wrlocks.count(&parent->inode->nestlock) == 0) {
dout(10) << " taking wrlock on " << parent->inode->nestlock << " on " << *parent->inode << dendl;
mds->locker->wrlock_force(&parent->inode->nestlock, mut);
}
// now we can project the inode rstat diff the dirfrag
SnapRealm *prealm = parent->inode->find_snaprealm();
snapid_t follows = cfollows;
if (follows == CEPH_NOSNAP)
follows = prealm->get_newest_seq();
// NOTE: this snapid_t `first` shadows the outer bool `first` for the rest of this block
snapid_t first = follows+1;
// first, if the frag is stale, bring it back in sync.
parent->resync_accounted_rstat();
// now push inode rstats into frag
project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
cur->clear_dirty_rstat();
}
// decide whether propagation stops at this level
bool stop = false;
if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
stop = true;
}
// delay propagating until later?
// (never applied to the first level; only ancestors are rate-limited by mds_dirstat_min_interval)
if (!stop && !first &&
g_conf->mds_dirstat_min_interval > 0) {
double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
if (since_last_prop < g_conf->mds_dirstat_min_interval) {
dout(10) << "predirty_journal_parents last prop " << since_last_prop
<< " < " << g_conf->mds_dirstat_min_interval
<< ", stopping" << dendl;
stop = true;
} else {
dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
}
}
// can cast only because i'm passing nowait=true in the sole user
MDRequestRef mdmut =
ceph::static_pointer_cast<MDRequestImpl,MutationImpl>(mut);
// if we do not already hold pin's nestlock wrlock, try to take it (third argument true); stop if that or the versionlock is unavailable
if (!stop &&
mut->wrlocks.count(&pin->nestlock) == 0 &&
(!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
//true
!mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
)) { // ** do not initiate.. see above comment **
dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
<< " on " << *pin << dendl;
stop = true;
}
// stopping here: mark the scatterlocks as updated and queue pin on the log segment's dirty lists, then leave the loop
if (stop) {
dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
mds->locker->mark_updated_scatterlock(&pin->nestlock);
mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
mut->add_updated_lock(&pin->nestlock);
if (do_parent_mtime || linkunlink) {
mds->locker->mark_updated_scatterlock(&pin->filelock);
mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
mut->add_updated_lock(&pin->filelock);
}
break;
}
if (!mut->wrlocks.count(&pin->versionlock))
mds->locker->local_wrlock_grab(&pin->versionlock, mut);
assert(mut->wrlocks.count(&pin->nestlock) ||
mut->is_slave());
// remember when we last propagated through pin (used by the rate limit above)
pin->last_dirstat_prop = mut->get_mds_stamp();
// dirfrag -> diri
mut->auth_pin(pin);
mut->add_projected_inode(pin);
lsi.push_front(pin);
pin->pre_cow_old_inode(); // avoid cow mayhem!
inode_t *pi = pin->project_inode();
pi->version = pin->pre_dirty();
// dirstat
if (do_parent_mtime || linkunlink) {
dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
bool touched_mtime = false;
// apply the not-yet-accounted part of the fragstat to the inode's dirstat, then mark it accounted
pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, touched_mtime);
pf->accounted_fragstat = pf->fragstat;
if (touched_mtime)
pi->mtime = pi->ctime = pi->dirstat.mtime;
dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl;
if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
// the string-literal asserts below fire only when g_conf->mds_verify_scatter is set
if (pi->dirstat.size() < 0)
assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
if (pi->dirstat.size() != pf->fragstat.size()) {
mds->clog->error() << "unmatched fragstat size on single dirfrag "
<< parent->dirfrag() << ", inode has " << pi->dirstat
<< ", dirfrag has " << pf->fragstat << "\n";
// trust the dirfrag for now
pi->dirstat = pf->fragstat;
assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
}
}
}
/*
* the rule here is to follow the _oldest_ parent with dirty rstat
* data. if we don't propagate all data, we add ourselves to the
* nudge list. that way all rstat data will (eventually) get
* pushed up the tree.
*
* actually, no. for now, silently drop rstats for old parents. we need
* hard link backpointers to do the above properly.
*/
// stop?
if (pin->is_base())
break;
parentdn = pin->get_projected_parent_dn();
assert(parentdn);
// rstat
if (primary_dn) {
dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;
// first, if the frag is stale, bring it back in sync.
parent->resync_accounted_rstat();
// push each dirty old rstat, then the current rstat, from the frag into the inode
for (map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
p != parent->dirty_old_rstat.end();
++p)
project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first, p->first, pin, true);//false);
parent->dirty_old_rstat.clear();
project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);
pf->accounted_rstat = pf->rstat;
if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
if (pi->rstat.rbytes != pf->rstat.rbytes) {
mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
<< parent->dirfrag() << ", inode has " << pi->rstat
<< ", dirfrag has " << pf->rstat << "\n";
// trust the dirfrag for now
pi->rstat = pf->rstat;
assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
}
}
}
parent->check_rstats();
broadcast_quota_to_client(pin);
// next parent!
cur = pin;
parent = parentdn->get_dir(); // move up to the dirfrag that holds pin's projected parent dentry
// the link delta and mtime update apply only to the first level; ancestors are always handled as primary
linkunlink = 0;
do_parent_mtime = false;
primary_dn = true;
first = false;
}
// now, stick it in the blob
// parent is the dirfrag at which the walk ended
assert(parent);
assert(parent->is_auth());
blob->add_dir_context(parent);
blob->add_dir(parent, true);
for (list<CInode*>::iterator p = lsi.begin(); // journal every projected ancestor inode collected in lsi
p != lsi.end();
++p) {
CInode *cur = *p;
journal_dirty_inode(mut.get(), blob, cur);
}
}
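predirty_journal_parents函数主要通过一个while循环,从传入节点所在的目录分片(dirfrag)开始逐级向上,对各级父目录做投影并预标记为脏(pre_dirty),同时更新其fragstat/rstat统计信息;如果某一级父inode无法auth pin,或无法获得nestlock/versionlock的wrlock,则标记相应的scatterlock后提前停止。循环结束后,将目录上下文以及lsi中记录的各级父inode加入参数blob(EMetaBlob)中。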
以下为journal_and_reply函数
/*******
* some generic stuff for finishing off requests
*/
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSInternalContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;

  // Record the trace inode/dentry on the request for the eventual reply,
  // pinning whichever of them is non-null.
  mdr->tracei = in;
  if (in) {
    mdr->pin(in);
  }
  mdr->tracedn = dn;
  if (dn) {
    mdr->pin(dn);
  }

  // Give early_reply() its chance first; mdr->did_early_reply is consulted
  // further down to decide between dropping rdlocks and flushing.
  early_reply(mdr, in, dn);

  // Hand the event and its completion context to the mdlog.
  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  const bool replaying = mdr->client_request && mdr->client_request->is_replay();

  if (!replaying) {
    // Normal request: with an early reply only the rdlocks are dropped;
    // without one the log is flushed.
    if (mdr->did_early_reply) {
      mds->locker->drop_rdlocks(mdr.get());
    } else {
      mdlog->flush();
    }
    return;
  }

  // Replayed request: queue the next replay op if there is one ...
  if (mds->queue_one_replay()) {
    dout(10) << " queued next replay op" << dendl;
    return;
  }

  // ... otherwise this was the last one, so flush the log.
  dout(10) << " journaled last replay op, flushing" << dendl;
  mdlog->flush();
}
最终是否立即将mdlog刷出取决于请求类型:对于重放(replay)请求,若还能排入下一个重放操作则不刷出,否则刷出mdlog;对于普通请求,若已经early reply则只释放rdlocks,否则刷出mdlog。