从handle_client_mkdir函数操作来分析MDLog与LogSegment过程

整个函数的要点如下:

1、 CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false); //获取该directory的目录项

2、 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode); //创建该dir的CInode新节点newi

3、 CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); //获取或创建一个CDir项newdir

4、将这些改变记入日志MDLog中

首先看下handle_client_mkdir函数内容

// MKDIR
/*
 * Handle a client mkdir request.
 * This function takes responsibility for the passed mdr.
 *
 * Flow, as written below:
 *   1. Resolve and xlock the target dentry; bail out if that did not yield
 *      a dentry (the helper returned NULL).
 *   2. Reject the request with -EROFS when it targets a snapshot
 *      (mdr->snapid != CEPH_NOSNAP).
 *   3. Take the collected locks, including an rdlock on the parent
 *      directory inode's authlock.
 *   4. Prepare the new directory inode (newi) and its first dirfrag (newdir).
 *   5. Build an EUpdate("mkdir") log event describing the change, issue a
 *      capability on the new inode, and hand everything to
 *      journal_and_reply().
 */
void Server::handle_client_mkdir(MDRequestRef& mdr)
{
  MClientRequest *req = mdr->client_request;
  set<SimpleLock*> rdlocks, wrlocks, xlocks;

  // Look up the dentry for the directory being created.
  CDentry *dn = rdlock_path_xlock_dentry(mdr, 0, rdlocks, wrlocks, xlocks, false, false, false);
  if (!dn) return;
  if (mdr->snapid != CEPH_NOSNAP) {
    respond_to_request(mdr, -EROFS);
    return;
  }
  CInode *diri = dn->get_dir()->get_inode();
  rdlocks.insert(&diri->authlock);
  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
    return;

  // new inode
  SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
  snapid_t follows = realm->get_newest_seq();

  // Force the file-type bits of the requested mode to "directory".
  unsigned mode = req->head.args.mkdir.mode;
  mode &= ~S_IFMT;
  mode |= S_IFDIR;
  // Create the inode (newi) that will represent the new directory.
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode);
  assert(newi);

  // it's a directory.
  dn->push_projected_linkage(newi);

  newi->inode.version = dn->pre_dirty();
  newi->inode.rstat.rsubdirs = 1;
  newi->inode.update_backtrace();

  dout(12) << " follows " << follows << dendl;
  if (follows >= dn->first)
    dn->first = follows + 1;
  newi->first = dn->first;

  // ...and that new dir is empty.
  // Get or open the dirfrag (newdir) of the new inode.
  CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t());
  newdir->mark_complete();
  newdir->fnode.version = newdir->pre_dirty();

  // prepare finisher
  // Everything below records the changes made above in the MDLog journal.
  mdr->ls = mdlog->get_current_segment();

  // Metadata operations are journaled as events; this one is an EUpdate
  // labelled "mkdir".
  EUpdate *le = new EUpdate(mdlog, "mkdir");

  // Open the log entry; predirty_journal_parents() asserts
  // mdlog->entry_is_open() before doing any work.
  mdlog->start_entry(le);

  // Record the client request id in the event's metablob.
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());

  // NOTE(review): presumably journals the inode numbers allocated for this
  // request -- confirm against journal_allocated_inos().
  journal_allocated_inos(mdr, &le->metablob);

  // Project the stat changes up the parent directories and add them to the
  // metablob; linkunlink=1 because one entry is being linked.
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);

  // Journal dn as the primary dentry for newi.
  // NOTE(review): the meaning of the two boolean flags is not visible here.
  le->metablob.add_primary_dentry(dn, newi, true, true);
  le->metablob.add_new_dir(newdir); // dirty AND complete AND new

  // issue a cap on the directory
  int cmode = CEPH_FILE_MODE_RDWR;  // read/write file mode
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr->session, realm, req->is_replay());
  if (cap) {
    cap->set_wanted(0);

    // put locks in excl mode
    newi->filelock.set_state(LOCK_EXCL);
    newi->authlock.set_state(LOCK_EXCL);
    newi->xattrlock.set_state(LOCK_EXCL);
    cap->issue_norevoke(CEPH_CAP_AUTH_EXCL|CEPH_CAP_AUTH_SHARED|
                        CEPH_CAP_XATTR_EXCL|CEPH_CAP_XATTR_SHARED);
  }

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());
  // Append newi to the open_files list of the current log segment.
  LogSegment *ls = mds->mdlog->get_current_segment();
  ls->open_files.push_back(&newi->item_open_file);

  // Submit the event and reply to the client (see journal_and_reply()).
  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(mds, mdr, dn, newi, follows));
}

注意上面代码中的EMetaBlob类中定义了相关类或结构体用来存储要写入日志的元数据。

函数的流程图如下:

以下为 predirty_journal_parents 函数

/*
 * NOTE: we _have_ to delay the scatter if we are called during a
 * rejoin, because we can't twiddle locks between when the
 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
 * normally, this isn't a problem: a recover mds doesn't twiddle locks
 * (no requests), and a survivor acks immediately.  _except_ that
 * during rejoin_(weak|strong) processing, we may complete a lock
 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
 * scatterlock state in that case or the lock states will get out of
 * sync between the auth and replica.
 *
 * the simple solution is to never do the scatter here.  instead, put
 * the scatterlock on a list if it isn't already wrlockable.  this is
 * probably the best plan anyway, since we avoid too many
 * scatters/locks under normal usage.
 */
/*
 * some notes on dirlock/nestlock scatterlock semantics:
 *
 * the fragstat (dirlock) will never be updated without
 * dirlock+nestlock wrlock held by the caller.
 *
 * the rstat (nestlock) _may_ get updated without a wrlock when nested
 * data is pushed up the tree.  this could be changed with some
 * restructuring here, but in its current form we ensure that the
 * fragstat+rstat _always_ reflect an accurate summation over the dir
 * frag, which is nice.  and, we only need to track frags that need to
 * be nudged (and not inodes with pending rstat changes that need to
 * be pushed into the frag).  a consequence of this is that the
 * accounted_rstat on scatterlock sync may not match our current
 * rstat.  this is normal and expected.
 */
/*
 * Project fragstat/rstat changes caused by a change to `in` up through its
 * parent dirfrags and directory inodes, and add the touched directories
 * and inodes to `blob`.
 *
 * mut        - mutation that collects the auth pins, projected
 *              fnodes/inodes and locks taken here.
 * blob       - metablob of the currently open log entry; receives the dir
 *              context, the last dirfrag reached and every projected
 *              parent inode.
 * in         - inode whose change is being propagated; nothing is done if
 *              in->is_base().
 * parent     - dirfrag containing `in`; if NULL it is taken from in's
 *              projected parent dentry (only allowed with PREDIRTY_PRIMARY).
 * flags      - bitmask of PREDIRTY_PRIMARY / PREDIRTY_DIR / PREDIRTY_SHALLOW.
 *              PREDIRTY_SHALLOW is only reported in the debug output within
 *              this function.
 * linkunlink - delta added to the first parent's nsubdirs (if `in` is a
 *              directory) or nfiles (otherwise); 0 for no link change.
 * cfollows   - snapid used as "follows" for the rstat projection; when it
 *              is CEPH_NOSNAP the parent snaprealm's newest seq is used.
 *
 * Requires that a log entry is open (asserted below).
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                       CInode *in, CDir *parent,
                       int flags, int linkunlink,
                       snapid_t cfollows)
{
  // decode the behaviour switches from the flags bitmask
  bool primary_dn = flags & PREDIRTY_PRIMARY;
  bool do_parent_mtime = flags & PREDIRTY_DIR;
  bool shallow = flags & PREDIRTY_SHALLOW;

  assert(mds->mdlog->entry_is_open());

  // make sure stamp is set
  if (mut->get_mds_stamp() == utime_t())
    mut->set_mds_stamp(ceph_clock_now(g_ceph_context));

  if (in->is_base())
    return;

  dout(10) << "predirty_journal_parents"
       << (do_parent_mtime ? " do_parent_mtime":"")
       << " linkunlink=" <<  linkunlink
       << (primary_dn ? " primary_dn":" remote_dn")
       << (shallow ? " SHALLOW":"")
       << " follows " << cfollows
       << " " << *in << dendl;

  if (!parent) {
    assert(primary_dn);
    parent = in->get_projected_parent_dn()->get_dir();
  }

  if (flags == 0 && linkunlink == 0) {
    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
    blob->add_dir_context(parent);
    return;
  }

  // build list of inodes to wrlock, dirty, and update
  // (lsi collects every parent inode projected by the loop below)
  list<CInode*> lsi;
  CInode *cur = in;
  CDentry *parentdn = NULL;
  bool first = true;
  // Walk up the tree one level per iteration: update the parent dirfrag's
  // mtime/size/rstat, then project the result into the parent inode.
  while (parent) {
    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
    assert(parent->is_auth());
    
    // opportunistically adjust parent dirfrag
    CInode *pin = parent->get_inode();

    // inode -> dirfrag
    mut->auth_pin(parent);
    mut->add_projected_fnode(parent);

    fnode_t *pf = parent->project_fnode();
    pf->version = parent->pre_dirty();

    if (do_parent_mtime || linkunlink) {
      assert(mut->wrlocks.count(&pin->filelock));
      assert(cfollows == CEPH_NOSNAP);
      
      // update stale fragstat?
      parent->resync_accounted_fragstat();

      if (do_parent_mtime) {
    pf->fragstat.mtime = mut->get_op_stamp();
    // rctime is only ever moved forward here
    if (pf->fragstat.mtime > pf->rstat.rctime) {
      dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
      pf->rstat.rctime = pf->fragstat.mtime;
    } else {
      dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
    }
      }
      if (linkunlink) {
    dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
    if (in->is_dir()) {
      pf->fragstat.nsubdirs += linkunlink;
      //pf->rstat.rsubdirs += linkunlink;
    } else {
       pf->fragstat.nfiles += linkunlink;
       //pf->rstat.rfiles += linkunlink;
    }
      }
    }

    
    // rstat
    if (!primary_dn) {
      // don't update parent this pass
    } else if (!linkunlink && !(parent->inode->nestlock.can_wrlock(-1) &&
                    parent->inode->versionlock.can_wrlock())) {
      dout(20) << " unwritable parent nestlock " << parent->inode->nestlock
           << ", marking dirty rstat on " << *cur << dendl;
      cur->mark_dirty_rstat();
   } else {
      // if we don't hold a wrlock reference on this nestlock, take one,
      // because we are about to write into the dirfrag fnode and that needs
      // to commit before the lock can cycle.
     if (linkunlink) {
       assert(parent->inode->nestlock.get_num_wrlocks() || mut->is_slave());
     }

      if (mut->wrlocks.count(&parent->inode->nestlock) == 0) {
    dout(10) << " taking wrlock on " << parent->inode->nestlock << " on " << *parent->inode << dendl;
    mds->locker->wrlock_force(&parent->inode->nestlock, mut);
      }

      // now we can project the inode rstat diff the dirfrag
      SnapRealm *prealm = parent->inode->find_snaprealm();
      
      snapid_t follows = cfollows;
      if (follows == CEPH_NOSNAP)
    follows = prealm->get_newest_seq();
      
      // NOTE: this snapid_t shadows the outer `bool first` within this branch
      snapid_t first = follows+1;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      // now push inode rstats into frag
      project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
      cur->clear_dirty_rstat();
    }

    // Decide whether propagation has to stop at this level.
    bool stop = false;
    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
      stop = true;
    }

    // delay propagating until later?
    // (never applies to the first level, only to ancestors further up)
    if (!stop && !first &&
    g_conf->mds_dirstat_min_interval > 0) {
      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
      if (since_last_prop < g_conf->mds_dirstat_min_interval) {
    dout(10) << "predirty_journal_parents last prop " << since_last_prop
         << " < " << g_conf->mds_dirstat_min_interval
         << ", stopping" << dendl;
    stop = true;
      } else {
    dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
      }
    }

    // can cast only because i'm passing nowait=true in the sole user
    MDRequestRef mdmut =
      ceph::static_pointer_cast<MDRequestImpl,MutationImpl>(mut);
    if (!stop &&
    mut->wrlocks.count(&pin->nestlock) == 0 &&
    (!pin->versionlock.can_wrlock() ||                   // make sure we can take versionlock, too
     //true
     !mds->locker->wrlock_start(&pin->nestlock, mdmut, true)
     )) {  // ** do not initiate.. see above comment **
      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
           << " on " << *pin << dendl;
      stop = true;
    }
    if (stop) {
      // Cannot go further now: mark the scatterlocks as updated and queue
      // the inode on the log segment's dirty lists, then leave the loop
      // with `parent` still pointing at this dirfrag.
      dout(10) << "predirty_journal_parents stop.  marking nestlock on " << *pin << dendl;
      mds->locker->mark_updated_scatterlock(&pin->nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
      mut->add_updated_lock(&pin->nestlock);
      if (do_parent_mtime || linkunlink) {
    mds->locker->mark_updated_scatterlock(&pin->filelock);
    mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
    mut->add_updated_lock(&pin->filelock);
      }
      break;
    }
    if (!mut->wrlocks.count(&pin->versionlock))
      mds->locker->local_wrlock_grab(&pin->versionlock, mut);

    assert(mut->wrlocks.count(&pin->nestlock) ||
       mut->is_slave());
    
    pin->last_dirstat_prop = mut->get_mds_stamp();

    // dirfrag -> diri
    mut->auth_pin(pin);
    mut->add_projected_inode(pin);
    // push_front: inodes reached later (higher in the tree) end up first
    lsi.push_front(pin);

    pin->pre_cow_old_inode();  // avoid cow mayhem!

    inode_t *pi = pin->project_inode();
    pi->version = pin->pre_dirty();

    // dirstat
    if (do_parent_mtime || linkunlink) {
      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
      dout(20) << "predirty_journal_parents         - " << pf->accounted_fragstat << dendl;
      bool touched_mtime = false;
      pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, touched_mtime);
      pf->accounted_fragstat = pf->fragstat;
      if (touched_mtime)
    pi->mtime = pi->ctime = pi->dirstat.mtime;
      dout(20) << "predirty_journal_parents     gives " << pi->dirstat << " on " << *pin << dendl;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
    if (pi->dirstat.size() < 0)
      assert(!"negative dirstat size" == g_conf->mds_verify_scatter);
    if (pi->dirstat.size() != pf->fragstat.size()) {
      mds->clog->error() << "unmatched fragstat size on single dirfrag "
         << parent->dirfrag() << ", inode has " << pi->dirstat
         << ", dirfrag has " << pf->fragstat << "\n";
      
      // trust the dirfrag for now
      pi->dirstat = pf->fragstat;

      assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter);
    }
      }
    }

    /* 
     * the rule here is to follow the _oldest_ parent with dirty rstat
     * data.  if we don't propagate all data, we add ourselves to the
     * nudge list.  that way all rstat data will (eventually) get
     * pushed up the tree.
     *
     * actually, no.  for now, silently drop rstats for old parents.  we need 
     * hard link backpointers to do the above properly.
     */

    // stop?
    if (pin->is_base())
      break;
    parentdn = pin->get_projected_parent_dn();
    assert(parentdn);

    // rstat
    if (primary_dn) {

      dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      for (map<snapid_t,old_rstat_t>::iterator p = parent->dirty_old_rstat.begin();
       p != parent->dirty_old_rstat.end();
       ++p)
    project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first, p->first, pin, true);//false);
      parent->dirty_old_rstat.clear();
      project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);

      pf->accounted_rstat = pf->rstat;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
    if (pi->rstat.rbytes != pf->rstat.rbytes) { 
      mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
          << parent->dirfrag() << ", inode has " << pi->rstat
          << ", dirfrag has " << pf->rstat << "\n";
      
      // trust the dirfrag for now
      pi->rstat = pf->rstat;

      assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter);
    }
      }
    }

    parent->check_rstats();
    broadcast_quota_to_client(pin);
    // next parent!
    cur = pin;
    parent = parentdn->get_dir();              // move up to the dirfrag containing pin
    // higher levels only receive rstat propagation: no link delta, no mtime
    linkunlink = 0;
    do_parent_mtime = false;
    primary_dn = true;
    first = false;
  }

  // now, stick it in the blob
  assert(parent);
  assert(parent->is_auth());
  blob->add_dir_context(parent);
  blob->add_dir(parent, true);
  // journal every parent inode that was projected in the loop above
  for (list<CInode*>::iterator p = lsi.begin();
       p != lsi.end();
       ++p) {
    CInode *cur = *p;
    journal_dirty_inode(mut.get(), blob, cur);
  }
 
}

 

predirty_journal_parents函数主要通过一个while循环,将传入的参数节点的父目录全部标记为脏,并将这些目录存入参数中的EMetaBlob类的相应结构体内,使得每个dentry与inode相对应。

 

以下为journal_and_reply函数

/*******
 * some generic stuff for finishing off requests
 */
/*
 * Remember the trace inode/dentry on the request, attempt an early reply,
 * submit the log event with its completion context, and then decide how
 * the journal gets flushed.
 *
 * mdr - request being finished; marked as committing here.
 * in  - trace inode for the eventual reply (may be NULL).
 * dn  - trace dentry for the eventual reply (may be NULL).
 * le  - log event to submit.
 * fin - context handed to submit_mdlog_entry() together with le.
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSInternalContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;

  // Keep the trace items pinned so they are still around for the reply.
  mdr->tracei = in;
  if (in) {
    mdr->pin(in);
  }
  mdr->tracedn = dn;
  if (dn) {
    mdr->pin(dn);
  }

  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  const bool replayed_request =
    mdr->client_request && mdr->client_request->is_replay();

  if (!replayed_request) {
    // Normal request: with an early reply already sent only the rdlocks
    // are dropped; otherwise the journal is flushed right away.
    if (mdr->did_early_reply) {
      mds->locker->drop_rdlocks(mdr.get());
    } else {
      mdlog->flush();
    }
    return;
  }

  // Replayed request: flush only when no further replay op was queued.
  if (mds->queue_one_replay()) {
    dout(10) << " queued next replay op" << dendl;
    return;
  }
  dout(10) << " journaled last replay op, flushing" << dendl;
  mdlog->flush();
}

最终将mdlog刷出。

 

转载于:https://www.cnblogs.com/noblemore/p/4922845.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值