mongodb源码分析(十)数据的插入
本文我们分析mongodb中数据的插入流程.插入的简单流程可以归结如下:
1. 如果存在对应collection则从collection中分配空间,然后将数据保存到分配的空间中,不存在则先从database中分配对应的collection,若database不存在则分配database,建立xx.ns和xx.0 等文件.
2. 根据插入数据更新collection中的索引.
下面来看代码,根据前面的分析我们知道插入操作的入口函数为:receivedInsert.
- void receivedInsert(Message& m, CurOp& op) {
- // NOTE(review): abridged excerpt -- the declarations of DbMessage d and of
- // const char *ns (both parsed from the message m) were elided by the article.
- BSONObj first = d.nextJsObj();
- vector<BSONObj> multi;
- while (d.moreJSObjs()){//batch insert: collect every remaining doc from the message
- if (multi.empty()) // first pass
- multi.push_back(first);
- multi.push_back( d.nextJsObj() );
- }
- while ( true ) {
- Lock::DBWrite lk(ns);//take the per-database write lock before touching data
- if ( handlePossibleShardedMessage( m , 0 ) )
- return;
- Client::Context ctx(ns);
- if( !multi.empty() ) {
- const bool keepGoing = d.reservedField() & InsertOption_ContinueOnError;
- insertMulti(keepGoing, ns, multi);//loops over checkAndInsert; when keepGoing is true a failed insert does not stop the remaining ones, otherwise the batch aborts on the first error
- return;
- }
- checkAndInsert(ns, first);//single-document insert path
- globalOpCounters.incInsertInWriteLock(1);
- return;
- }
- }
- }
- void checkAndInsert(const char *ns, /*modifies*/BSONObj& js) {
- uassert( 10059 , "object to insert too large", js.objsize() <= BSONObjMaxUserSize);//a single doc larger than BSONObjMaxUserSize is rejected; this limit can only be changed by patching the source
- {
- // check no $ modifiers. note we only check top level. (scanning deep would be quite expensive)
- BSONObjIterator i( js );//top-level field names must not start with '$'
- while ( i.more() ) {
- BSONElement e = i.next();
- uassert( 13511 , "document to insert can't have $ fields" , e.fieldName()[0] != '$' );
- }
- }
- theDataFileMgr.insertWithObjMod(ns, js, false); // js may be modified in the call to add an _id field.
- logOp("i", ns, js);//record the op for master/slave or replica-set replication (the oplog)
- }
- DiskLoc DataFileMgr::insertWithObjMod(const char *ns, BSONObj &o, bool god) {
- // Thin wrapper around insert(): if insert() generated an _id, re-read the
- // stored record so the caller's object o reflects the added field.
- bool addedID = false;
- DiskLoc loc = insert( ns, o.objdata(), o.objsize(), god, true, &addedID );
- if( addedID && !loc.isNull() )
- o = BSONObj::make( loc.rec() );
- return loc;
- }
- DiskLoc DataFileMgr::insert(const char *ns, const void *obuf, int len, bool god, bool mayAddIndex, bool *addedID) {
- bool wouldAddIndex = false;
- {
- const char *sys = strstr(ns, "system.");
- if ( sys && !insert_checkSys(sys, ns, wouldAddIndex, obuf, god) )
- return DiskLoc();
- }
- bool addIndex = wouldAddIndex && mayAddIndex;//true when this insert is actually an index build (insert into system.indexes)
- NamespaceDetails *d = nsdetails(ns);
- if ( d == 0 ) {//the collection does not exist yet: allocate it -- an extent sized from the inserted data plus a NamespaceDetails; when god is false an _id index is also created, and the collection is registered in the system.namespaces collection
- d = insert_newNamespace(ns, len, god);
- }
- NamespaceDetails *tableToIndex = 0;
- string tabletoidxns;
- BSONObj fixedIndexObject;
- if ( addIndex ) {//index-build path: e.g. db.coll.ensureIndex({x:1}) goes through here
- verify( obuf );
- BSONObj io((const char *) obuf);
- if( !prepareToBuildIndex(io, god, tabletoidxns, tableToIndex, fixedIndexObject ) ) {
- // prepare creates _id itself, or this indicates to fail the build silently (such
- // as if index already exists)
- return DiskLoc();
- }
- if ( ! fixedIndexObject.isEmpty() ) {
- obuf = fixedIndexObject.objdata();
- len = fixedIndexObject.objsize();
- }
- }
- int addID = 0; // 0 if not adding _id; if adding, the length of that new element
- if( !god ) {//generate an _id when the document does not carry one
- /* Check if we have an _id field. If we don't, we'll add it.
- Note that btree buckets which we insert aren't BSONObj's, but in that case god==true.
- */
- BSONObj io((const char *) obuf);
- BSONElement idField = io.getField( "_id" );
- uassert( 10099 , "_id cannot be an array", idField.type() != Array );
- // we don't add _id for capped collections in local as they don't have an _id index
- if( idField.eoo() && !wouldAddIndex &&
- !str::equals( nsToDatabase( ns ).c_str() , "local" ) && d->haveIdIndex() ) {
- if( addedID )
- *addedID = true;
- addID = len;
- idToInsert_.oid.init();
- len += idToInsert.size();
- }
- BSONElementManipulator::lookForTimestamps( io );
- }
- int lenWHdr = d->getRecordAllocationSize( len + Record::HeaderSize );//compute the allocation size. With Flag_UsePowerOf2Sizes (set via db.coll.runCommand({collMod:"coll","usePowerOf2Sizes":true})) the size is always rounded up to a power of two; without the flag the needed size is multiplied by the padding factor, which adjusts dynamically between 1 and 2 depending on whether updates outgrow the original doc length.
- // If the collection is capped, check if the new object will violate a unique index
- // constraint before allocating space.
- if ( d->nIndexes && d->isCapped() && !god ) {//uniqueness pre-check (indexes created with unique:true)
- checkNoIndexConflicts( d, BSONObj( reinterpret_cast<const char *>( obuf ) ) );
- }
- bool earlyIndex = true;
- DiskLoc loc;
- if( addID || tableToIndex || d->isCapped() ) {
- // if need id, we don't do the early indexing. this is not the common case so that is sort of ok
- earlyIndex = false;//actual space allocation; see <a href="http://blog.csdn.net/yhjj0108/article/details/8278041">http://blog.csdn.net/yhjj0108/article/details/8278041</a> for the allocation algorithm
- loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
- }
- else {
- loc = d->allocWillBeAt(ns, lenWHdr);
- if( loc.isNull() ) {
- // need to get a new extent so we have to do the true alloc now (not common case)
- earlyIndex = false;
- loc = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
- }
- }
- if ( loc.isNull() ) {
- log() << "insert: couldn't alloc space for object ns:" << ns << " capped:" << d->isCapped() << endl;
- verify(d->isCapped());
- return DiskLoc();
- }
- if( earlyIndex ) {
- // add record to indexes using two step method so we can do the reading outside a write lock
- if ( d->nIndexes ) {
- verify( obuf );
- BSONObj obj((const char *) obuf);
- try {//extract every indexed field from obj and insert the resulting keys into the corresponding index btrees
- indexRecordUsingTwoSteps(ns, d, obj, loc, true);
- }
- catch( AssertionException& ) {
- // should be a dup key error on _id index
- dassert( !tableToIndex && !d->isCapped() );
- // no need to delete/rollback the record as it was not added yet
- throw;
- }
- }
- // really allocate now
- DiskLoc real = allocateSpaceForANewRecord(ns, d, lenWHdr, god);
- verify( real == loc );
- }
- Record *r = loc.rec();
- {//copy the actual document bytes into the allocated record
- verify( r->lengthWithHeaders() >= lenWHdr );
- r = (Record*) getDur().writingPtr(r, lenWHdr);
- if( addID ) {
- /* a little effort was made here to avoid a double copy when we add an ID */
- ((int&)*r->data()) = *((int*) obuf) + idToInsert.size();
- memcpy(r->data()+4, idToInsert.rawdata(), idToInsert.size());
- memcpy(r->data()+4+idToInsert.size(), ((char *)obuf)+4, addID-4);
- }
- else {
- if( obuf ) // obuf can be null from internal callers
- memcpy(r->data(), obuf, len);
- }
- }
- addRecordToRecListInExtent(r, loc);//link the new record into the extent's record list
- /* durability todo : this could be a bit annoying / slow to record constantly */
- {
- NamespaceDetails::Stats *s = getDur().writing(&d->stats);
- s->datasize += r->netLength();
- s->nrecords++;
- }
- // we don't bother resetting query optimizer stats for the god tables - also god is true when adding a btree bucket
- if ( !god )
- NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
- if ( tableToIndex ) {//a new index was declared above; build the actual index entries now
- insert_makeIndex(tableToIndex, tabletoidxns, loc);
- }
- /* add this record to our indexes */
- if ( !earlyIndex && d->nIndexes ) {
- BSONObj obj(r->data());
- // not sure which of these is better -- either can be used. oldIndexRecord may be faster,
- // but twosteps handles dup key errors more efficiently.
- //oldIndexRecord(d, obj, loc);
- indexRecordUsingTwoSteps(ns, d, obj, loc, false);
- }
- d->paddingFits();
- return loc;
- }
- void indexRecordUsingTwoSteps(const char *ns, NamespaceDetails *d, BSONObj obj,
- DiskLoc loc, bool shouldBeUnlocked) {
- vector<int> multi;
- vector<BSONObjSet> multiKeys;
- IndexInterface::IndexInserter inserter;
- // Step 1, read phase.
- int n = d->nIndexesBeingBuilt();//for each index, extract the key values from obj and insert them
- {//into the btree: e.g. with index {x:1} and doc {x:1,y:1} the value 1 becomes the key; for a
- BSONObjSet keys;//multikey case like {x:[1,2,3,4,5]} each of 1..5 is inserted and x is flagged as a multikey index
- for ( int i = 0; i < n; i++ ) {//note: when obj yields no value and the index is not sparse, a null key is inserted instead
- // this call throws on unique constraint violation. we haven't done any writes yet so that is fine.
- fetchIndexInserters(/*out*/keys, inserter, d, i, obj, loc);//extracts the indexed field values from obj and computes where each key will be inserted
- if( keys.size() > 1 ) {//multikey index: defer all but the first key to the loop below
- multi.push_back(i);
- multiKeys.push_back(BSONObjSet());
- multiKeys[multiKeys.size()-1].swap(keys);
- }
- keys.clear();
- }
- }//the actual insertions happen next
- inserter.finishAllInsertions(); // Step 2, write phase.
- // now finish adding multikeys
- for( unsigned j = 0; j < multi.size(); j++ ) {//remaining keys of the multikey indexes
- unsigned i = multi[j];
- BSONObjSet& keys = multiKeys[j];
- IndexDetails& idx = d->idx(i);
- IndexInterface& ii = idx.idxInterface();
- Ordering ordering = Ordering::make(idx.keyPattern());
- d->setIndexIsMultikey(ns, i);//flag this index as multikey
- for( BSONObjSet::iterator k = ++keys.begin()/*skip 1*/; k != keys.end(); k++ ) {
- try {
- ii.bt_insert(idx.head, loc, *k, ordering, !idx.unique(), idx);
- } catch (AssertionException& e) {
- if( e.getCode() == 10287 && (int) i == d->nIndexes ) {
- DEV log() << "info: caught key already in index on bg indexing (ok)" << endl;
- }
- else {
- /* roll back previously added index entries
- note must do self index as it is multikey and could require some cleanup itself
- */
- for( int j = 0; j < n; j++ ) {
- try {
- _unindexRecord(d->idx(j), obj, loc, false);
- }
- catch(...) {
- log(3) << "unindex fails on rollback after unique key constraint prevented insert\n";
- }
- }
- throw;
- }
- }
- }
- }
- }
- void NOINLINE_DECL insert_makeIndex(NamespaceDetails *tableToIndex, const string& tabletoidxns, const DiskLoc& loc) {
- BSONObj info = loc.obj();
- bool background = info["background"].trueValue();
- if( background && cc().isSyncThread() ) {
- /* don't do background indexing on slaves. there are nuances. this could be added later
- but requires more code.
- */
- log() << "info: indexing in foreground on this replica; was a background index build on the primary" << endl;
- background = false;
- }
- int idxNo = tableToIndex->nIndexes;//append the index metadata to NamespaceDetails (or NamespaceDetails::extra)
- IndexDetails& idx = tableToIndex->addIndex(tabletoidxns.c_str(), !background); // clear transient info caches so they refresh; increments nIndexes
- getDur().writingDiskLoc(idx.info) = loc;
- try {//bulk-build the new index over the collection's existing data
- buildAnIndex(tabletoidxns, tableToIndex, idx, idxNo, background);
- }
- // NOTE(review): truncated excerpt -- the catch block paired with this try was elided by the article.
- }
- void buildAnIndex(string ns, NamespaceDetails *d, IndexDetails& idx, int idxNo, bool background) {
- // NOTE(review): abridged excerpt -- the declaration of `n` (key count) and the
- // surrounding timing/logging code were elided by the article.
- if( inDBRepair || !background ) {//build the index synchronously
- n = fastBuildIndex(ns.c_str(), d, idx, idxNo);
- verify( !idx.head.isNull() );
- }
- else {//build the index on a background thread
- BackgroundIndexBuildJob j(ns.c_str());
- n = j.go(ns, d, idx, idxNo);
- }
- }
- unsigned long long fastBuildIndex(const char *ns, NamespaceDetails *d, IndexDetails& idx, int idxNo) {
- // NOTE(review): abridged excerpt -- `precalced` (an optionally precomputed SortPhaseOne)
- // and the Timer `t` used below are declared in code elided by the article.
- CurOp * op = cc().curop();
- bool dupsAllowed = !idx.unique();
- bool dropDups = idx.dropDups() || inDBRepair;
- BSONObj order = idx.keyPattern();
- getDur().writingDiskLoc(idx.head).Null();
- /* get and sort all the keys ----- */
- ProgressMeterHolder pm( op->setMessage( "index: (1/3) external sort" , d->stats.nrecords , 10 ) );
- SortPhaseOne _ours;
- SortPhaseOne *phase1 = precalced;
- if( phase1 == 0 ) {
- phase1 = &_ours;
- SortPhaseOne& p1 = *phase1;
- shared_ptr<Cursor> c = theDataFileMgr.findAll(ns);
- p1.sorter.reset( new BSONObjExternalSorter(idx.idxInterface(), order) );
- p1.sorter->hintNumObjects( d->stats.nrecords );
- const IndexSpec& spec = idx.getSpec();
- while ( c->ok() ) {//keys are quick-sorted in memory in batches and each sorted batch is spilled to a file
- BSONObj o = c->current();//(the external sorter keeps the file handles)
- DiskLoc loc = c->currLoc();
- p1.addKeys(spec, o, loc);
- c->advance();
- pm.hit();
- if ( logLevel > 1 && p1.n % 10000 == 0 ) {
- printMemInfo( "\t iterating objects" );
- }
- };
- }
- pm.finished();
- BSONObjExternalSorter& sorter = *(phase1->sorter);
- // Ensure the index and external sorter have a consistent index interface (and sort order).
- fassert( 16408, &idx.idxInterface() == &sorter.getIndexInterface() );
- if( phase1->multi )//multikey index
- d->setIndexIsMultikey(ns, idxNo);
- if ( logLevel > 1 ) printMemInfo( "before final sort" );
- phase1->sorter->sort();
- if ( logLevel > 1 ) printMemInfo( "after final sort" );
- log(t.seconds() > 5 ? 0 : 1) << "\t external sort used : " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs" << endl;
- set<DiskLoc> dupsToDrop;
- /* build index --- */
- if( idx.version() == 0 )//two index formats exist: V0 and V1. V1 is the current default and per 10gen about 25% smaller; V0 is kept for compatibility with pre-2.0 systems
- buildBottomUpPhases2And3<V0>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
- else if( idx.version() == 1 ) //read the keys back from the sorter and insert them into the index btree; since they are already sorted this is a sequential read + insert
- buildBottomUpPhases2And3<V1>(dupsAllowed, idx, sorter, dropDups, dupsToDrop, op, phase1, pm, t);
- else
- verify(false);
- if( dropDups ) //unique index with duplicates present: the duplicate docs are deleted below
- log() << "\t fastBuildIndex dupsToDrop:" << dupsToDrop.size() << endl;
- for( set<DiskLoc>::iterator i = dupsToDrop.begin(); i != dupsToDrop.end(); i++ ){
- theDataFileMgr.deleteRecord( ns, i->rec(), *i, false /* cappedOk */ , true /* noWarn */ , isMaster( ns ) /* logOp */ );
- getDur().commitIfNeeded();
- }
- return phase1->n;
- }
需要注意的是sparse索引和索引的建立过程,当对已存储数据建立unique索引并指定dropDups时,重复的数据都会被删除.
原文链接:mongodb源码分析(十)数据的插入
作者: yhjj0108,杨浩
本文我们分析数据的删除,删除操作概括起来就是先遍历将collection中数据对应的索引删除,然后删除数据,最后将删除
的空间加入到之前文章描述的deletedList中.下面我们来看具体的代码吧.删除的入口是receivedDelete函数.
- void receivedDelete(Message& m, CurOp& op) {
- DbMessage d(m);
- const char *ns = d.getns();
- op.debug().ns = ns;
- int flags = d.pullInt();
- bool justOne = flags & RemoveOption_JustOne;//delete only a single matching doc
- bool broadcast = flags & RemoveOption_Broadcast;
- verify( d.moreJSObjs() );
- BSONObj pattern = d.nextJsObj();
- op.debug().query = pattern;
- op.setQuery(pattern);
- PageFaultRetryableSection s;
- while ( 1 ) {
- try {
- Lock::DBWrite lk(ns);
- // writelock is used to synchronize stepdowns w/ writes
- uassert( 10056 , "not master", isMasterNs( ns ) );
- // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
- if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
- return;
- Client::Context ctx(ns);//deleteObjects does the actual deletion work
- long long n = deleteObjects(ns, pattern, justOne, true);
- lastError.getSafe()->recordDelete( n );
- break;
- }
- catch ( PageFaultException& e ) {//record not in memory: touch it so the OS pages it in, then retry
- LOG(2) << "recordDelete got a PageFaultException" << endl;
- e.touch();
- }
- }
- }
- long long deleteObjects(const char *ns, BSONObj pattern, bool justOne, bool logop, bool god, RemoveSaver * rs ) {
- long long nDeleted = 0;//obtain a cursor from the query pattern
- shared_ptr< Cursor > creal = NamespaceDetailsTransient::getCursor( ns, pattern );
- if( !creal->ok() )
- return nDeleted;
- shared_ptr< Cursor > cPtr = creal;
- auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout, cPtr, ns) );
- cc->setDoingDeletes( true );
- CursorId id = cc->cursorid();
- bool canYield = !god && !(creal->matcher() && creal->matcher()->docMatcher().atomic());
- do {
- // TODO: we can generalize this I believe
- //
- bool willNeedRecord = (creal->matcher() && creal->matcher()->needRecord()) || pattern.isEmpty() || isSimpleIdQuery( pattern );
- if ( ! willNeedRecord ) {
- // TODO: this is a total hack right now
- // check if the index full encompasses query
- if ( pattern.nFields() == 1 &&
- str::equals( pattern.firstElement().fieldName() , creal->indexKeyPattern().firstElement().fieldName() ) )
- willNeedRecord = true;
- }
- if ( canYield && ! cc->yieldSometimes( willNeedRecord ? ClientCursor::WillNeed : ClientCursor::MaybeCovered ) ) {
- cc.release(); // has already been deleted elsewhere
- // TODO should we assert or something?
- break;
- }
- if ( !cc->ok() ) {
- break; // if we yielded, could have hit the end
- }
- // this way we can avoid calling prepareToYield() every time (expensive)
- // as well as some other nuances handled
- cc->setDoingDeletes( true );
- DiskLoc rloc = cc->currLoc();
- BSONObj key = cc->currKey();
- bool match = creal->currentMatches();
- cc->advance();
- if ( ! match )
- continue;
- // SERVER-5198 Advance past the document to be modified, but see SERVER-5725.
- while( cc->ok() && rloc == cc->currLoc() ) {//a multikey index can return the same record repeatedly; skip past it
- cc->advance();
- }
- bool foundAllResults = ( justOne || !cc->ok() );
- if ( !foundAllResults ) {
- // NOTE: Saving and restoring a btree cursor's position was historically described
- // as slow here.
- cc->c()->prepareToTouchEarlierIterate();
- }
- if ( logop ) {//log the delete for replication
- BSONElement e;
- if( BSONObj::make( rloc.rec() ).getObjectID( e ) ) {
- BSONObjBuilder b;
- b.append( e );
- bool replJustOne = true;
- logOp( "d", ns, b.done(), 0, &replJustOne );
- }
- else {
- problem() << "deleted object without id, not logging" << endl;
- }
- }
- if ( rs )//optionally save a copy of the doc about to be deleted
- rs->goingToDelete( rloc.obj() /*cc->c->current()*/ );
- theDataFileMgr.deleteRecord(ns, rloc.rec(), rloc);//the actual record removal
- nDeleted++;
- if ( foundAllResults ) {
- break;
- }
- cc->c()->recoverFromTouchingEarlierIterate();
- if( !god )
- getDur().commitIfNeeded();
- if( debug && god && nDeleted == 100 )
- log() << "warning high number of deletes with god=true which could use significant memory" << endl;
- }
- while ( cc->ok() );
- if ( cc.get() && ClientCursor::find( id , false ) == 0 ) {
- // TODO: remove this and the id declaration above if this doesn't trigger
- // if it does, then i'm very confused (ERH 06/2011)
- error() << "this should be impossible" << endl;
- printStackTrace();
- cc.release();
- }
- return nDeleted;
- }
receivedDelete->deleteObjects->deleteRecord
- void DataFileMgr::deleteRecord(const char *ns, Record *todelete, const DiskLoc& dl, bool cappedOK, bool noWarn, bool doLog ) {
- NamespaceDetails* d = nsdetails(ns);
- if ( d->isCapped() && !cappedOK ) {
- out() << "failing remove on a capped ns " << ns << endl;
- uassert( 10089 , "can't remove from a capped collection" , 0 );
- return;
- }
- BSONObj toDelete;
- if ( doLog ) {
- BSONElement e = dl.obj()["_id"];
- if ( e.type() ) {
- toDelete = e.wrap();
- }
- }
- /* check if any cursors point to us. if so, advance them. */
- ClientCursor::aboutToDelete(dl);
- unindexRecord(d, todelete, dl, noWarn);//remove this record's keys from every index
- _deleteRecord(d, ns, todelete, dl);//remove the data itself and put the freed space onto the deletedList
- NamespaceDetailsTransient::get( ns ).notifyOfWriteOp();
- if ( ! toDelete.isEmpty() ) {
- logOp( "d" , ns , toDelete );//replicate the delete by _id
- }
- }
原文链接: mongodb源码分析(十一)数据的删除
作者: yhjj0108,杨浩
相对于删除操作,更新操作复杂得多,因为其操作很多,mongodb提供了很多更新的操作符,另外还要考虑到更
新时如果原来的数据doc空间不够还得删除原来的doc再添加新的doc,相当于做了两次操作,这里的过程同样会影
响collection中所有的索引.下面来看代码吧,更新操作的入口为:
- void receivedUpdate(Message& m, CurOp& op) {
- DbMessage d(m);
- const char *ns = d.getns();
- op.debug().ns = ns;
- int flags = d.pullInt();
- BSONObj query = d.nextJsObj();
- BSONObj toupdate = d.nextJsObj();
- bool upsert = flags & UpdateOption_Upsert;
- bool multi = flags & UpdateOption_Multi;
- bool broadcast = flags & UpdateOption_Broadcast;
- op.debug().query = query;
- op.setQuery(query);
- PageFaultRetryableSection s;
- while ( 1 ) {
- try {
- Lock::DBWrite lk(ns);
- // void ReplSetImpl::relinquish() uses big write lock so
- // this is thus synchronized given our lock above.
- uassert( 10054 , "not master", isMasterNs( ns ) );//only a master may execute updates
- // if this ever moves to outside of lock, need to adjust check Client::Context::_finishInit
- if ( ! broadcast && handlePossibleShardedMessage( m , 0 ) )
- return;
- Client::Context ctx( ns );//the actual update work
- UpdateResult res = updateObjects(ns, toupdate, query, upsert, multi, true, op.debug() );
- lastError.getSafe()->recordUpdate( res.existing , res.num , res.upserted ); // for getlasterror
- break;
- }
- catch ( PageFaultException& e ) {//data to update is not in memory: touch it so the OS pages it in, then retry
- e.touch();
- }
- }
- }
直接分析_updateObjects. receivedUpdate->updateObjects->_updateObjects.
- UpdateResult _updateObjects( bool su,const char* ns,const BSONObj& updateobj,const BSONObj& patternOrig,bool upsert,
- bool multi,bool logop ,OpDebug& debug,RemoveSaver* rs,bool fromMigrate,const QueryPlanSelectionPolicy& planPolicy ) {
- Client& client = cc();
- int profile = client.database()->profile;
- debug.updateobj = updateobj;
- // The idea with these here it to make them loop invariant for
- // multi updates, and thus be a bit faster for that case. The
- // pointers may be left invalid on a failed or terminal yield
- // recovery.
- NamespaceDetails* d = nsdetails(ns); // can be null if an upsert...
- NamespaceDetailsTransient* nsdt = &NamespaceDetailsTransient::get(ns);
- auto_ptr<ModSet> mods;
- bool isOperatorUpdate = updateobj.firstElementFieldName()[0] == '$';
- int modsIsIndexed = false; // really the # of indexes
- if ( isOperatorUpdate ) {//parse the operators into a ModSet: e.g. update={$set:{a:1},$inc:{b:1}}
- if( d && d->indexBuildInProgress ) {//yields two mods -- one with operator $set on {a:1},
- set<string> bgKeys; //the other with operator $inc on {b:1}
- d->inProgIdx().keyPattern().getFieldNames(bgKeys);
- mods.reset( new ModSet(updateobj, nsdt->indexKeys(), &bgKeys) );
- }
- else {
- mods.reset( new ModSet(updateobj, nsdt->indexKeys()) );
- }
- modsIsIndexed = mods->isIndexed();//the mods may touch indexed fields; if so the indexes must be updated as well
- }
- // fast path for a single-_id query, e.g. pattern={_id:1}, updateobj={$inc:{x:1}} where x is not indexed
- if( planPolicy.permitOptimalIdPlan() && !multi && isSimpleIdQuery(patternOrig) && d &&
- !modsIsIndexed ) {
- int idxNo = d->findIdIndex();
- if( idxNo >= 0 ) {
- debug.idhack = true;//updateById follows much the same flow analyzed below, so it is not covered separately
- UpdateResult result = _updateById( isOperatorUpdate,idxNo,mods.get(),profile,d,nsdt,su,ns,
- updateobj,patternOrig,logop,debug,fromMigrate);
- if ( result.existing || ! upsert ) {
- return result;
- }
- else if ( upsert && ! isOperatorUpdate && ! logop) {
- // this handles repl inserts
- checkNoMods( updateobj );
- debug.upsert = true;
- BSONObj no = updateobj;
- theDataFileMgr.insertWithObjMod(ns, no, su);
- return UpdateResult( 0 , 0 , 1 , no );
- }
- }
- }
- int numModded = 0;
- debug.nscanned = 0;
- shared_ptr<Cursor> c =
- NamespaceDetailsTransient::getCursor( ns, patternOrig, BSONObj(), planPolicy );
- d = nsdetails(ns);
- nsdt = &NamespaceDetailsTransient::get(ns);
- bool autoDedup = c->autoDedup();
- if( c->ok() ) {
- set<DiskLoc> seenObjects;
- MatchDetails details;
- auto_ptr<ClientCursor> cc;
- do {
- if ( cc.get() == 0 &&//throw so the outer retry loop can have the OS page the record into memory
- client.allowedToThrowPageFaultException() &&
- ! c->currLoc().isNull() &&
- ! c->currLoc().rec()->likelyInPhysicalMemory() ) {
- throw PageFaultException( c->currLoc().rec() );
- }
- debug.nscanned++;
- if ( mods.get() && mods->hasDynamicArray() ) {
- // The Cursor must have a Matcher to record an elemMatchKey. But currently
- // a modifier on a dynamic array field may be applied even if there is no
- // elemMatchKey, so a matcher cannot be required.
- //verify( c->matcher() );
- details.requestElemMatchKey();
- }
- if ( !c->currentMatches( &details ) ) {
- c->advance();
- continue;
- }
- //match
- Record* r = c->_current();
- DiskLoc loc = c->currLoc();
- if ( c->getsetdup( loc ) && autoDedup ) {//the first visit to a loc has getsetdup() return false;
- c->advance();//a second visit (possible with a multikey index) returns true,
- continue;//so the duplicate is skipped. BasicCursor always returns false here
- }
- BSONObj js = BSONObj::make(r);
- BSONObj pattern = patternOrig;
- /* look for $inc etc. note as listed here, all fields to inc must be this type, you can't set some
- regular ones at the moment. */
- if ( isOperatorUpdate ) {
- if ( multi ) {
- // go to next record in case this one moves
- c->advance();
- // Update operations are deduped for cursors that implement their own
- // deduplication. In particular, some geo cursors are excluded.
- if ( autoDedup ) {
- if ( seenObjects.count( loc ) ) {//skip a loc that has already been updated
- continue;//(a first visit never hits this)
- }
- // SERVER-5198 Advance past the document to be modified, provided
- // deduplication is enabled, but see SERVER-5725.
- while( c->ok() && loc == c->currLoc() ) {//if the cursor still points at this loc,
- c->advance();//keep advancing until we are past it
- }
- }
- }
- const BSONObj& onDisk = loc.obj();
- ModSet* useMods = mods.get();
- bool forceRewrite = false;
- //note: ".$" is replaced with the array position recorded at match time.
- //this has a surprising consequence: for a doc {x:[1,2,3,4,5],y:[6,7,8,9,10]},
- //update({x:1,y:9},{$inc:{"x.$":1}},false,false) intends to bump the first x element
- //but actually produces {x:[1,2,3,5,5],y:[6,7,8,9,10]} -- arguably a bug
- auto_ptr<ModSet> mymodset;//handles the positional ".$" operator
- if ( details.hasElemMatchKey() && mods->hasDynamicArray() ) {
- useMods = mods->fixDynamicArray( details.elemMatchKey() );//elemMatchKey was recorded during query matching;
- mymodset.reset( useMods );//this simply substitutes ".$" with that recorded position
- forceRewrite = true;
- }//build the ModSetState and decide per field whether the update can be applied by simple value replacement, e.g. {$set:{a:10}}
- auto_ptr<ModSetState> mss = useMods->prepare( onDisk );//when a already exists it is just overwritten in place -- no extra space needed
- bool willAdvanceCursor = multi && c->ok() && ( modsIsIndexed || ! mss->canApplyInPlace() );
- if ( willAdvanceCursor ) {
- if ( cc.get() ) {
- cc->setDoingDeletes( true );
- }
- c->prepareToTouchEarlierIterate();
- }
- if ( modsIsIndexed <= 0 && mss->canApplyInPlace() ) {//in-place update, as in the example above
- mss->applyModsInPlace( true );// const_cast<BSONObj&>(onDisk) );
- if ( profile && !multi )
- debug.fastmod = true;
- if ( modsIsIndexed ) {
- seenObjects.insert( loc );
- }
- d->paddingFits();
- }
- else {
- if ( rs )
- rs->goingToDelete( onDisk );//save the doc that is about to change
- BSONObj newObj = mss->createNewFromMods();//the update cannot be done by simple replacement:
- checkTooLarge(newObj);//build a brand-new object from the old values plus the mods
- DiskLoc newLoc = theDataFileMgr.updateRecord(ns,d,nsdt,r,loc,newObj.objdata(),newObj.objsize(),debug);
- if ( newLoc != loc || modsIsIndexed ){
- // log() << "Moved obj " << newLoc.obj()["_id"] << " from " << loc << " to " << newLoc << endl;
- // object moved, need to make sure we don' get again
- seenObjects.insert( newLoc );
- }
- }
- numModded++;
- if ( ! multi )
- return UpdateResult( 1 , 1 , numModded , BSONObj() );
- if ( willAdvanceCursor )
- c->recoverFromTouchingEarlierIterate();
- getDur().commitIfNeeded();
- continue;
- }
- BSONElementManipulator::lookForTimestamps( updateobj );
- checkNoMods( updateobj );//non-operator update: whole-document replacement
- theDataFileMgr.updateRecord(ns, d, nsdt, r, loc , updateobj.objdata(), updateobj.objsize(), debug, su);
- return UpdateResult( 1 , 0 , 1 , BSONObj() );
- } while ( c->ok() );
- } // endif
- if ( numModded )
- return UpdateResult( 1 , 1 , numModded , BSONObj() );
- if ( upsert ) {//no document matched the query: fall through to an insert
- if ( updateobj.firstElementFieldName()[0] == '$' ) {
- // upsert of an $operation. build a default object
- BSONObj newObj = mods->createNewFromQuery( patternOrig );//build an object from the query pattern
- checkNoMods( newObj );
- debug.fastmodinsert = true;
- theDataFileMgr.insertWithObjMod(ns, newObj, su);
- return UpdateResult( 0 , 1 , 1 , newObj );
- }
- checkNoMods( updateobj );
- debug.upsert = true;
- BSONObj no = updateobj;
- theDataFileMgr.insertWithObjMod(ns, no, su);
- return UpdateResult( 0 , 0 , 1 , no );
- }
- return UpdateResult( 0 , isOperatorUpdate , 0 , BSONObj() );
- }
上面提到,若prepare得到的ModSetState判断可以原地更新(canApplyInPlace),那么就执行简单的替换,否则就需要调用
createNewFromMods函数创建出一个新的对象,并调用updateRecord去更新实际的数据.下面来看看createNewFromMods函数.
- BSONObj ModSetState::createNewFromMods() {
- // Build a new document from the original _obj plus the prepared mods.
- BSONObjBuilder b( (int)(_obj.objsize() * 1.1) );//the result is likely larger, so start with a bigger buffer
- createNewObjFromMods( "" , b , _obj );
- return _newFromMods = b.obj();
- }
- void ModSetState::createNewObjFromMods( const string& root,
- BSONObjBuilder& builder,
- const BSONObj& obj ) {
- // Merge obj's elements (in sorted field order) with the mods rooted at `root`.
- BSONObjIteratorSorted es( obj );
- createNewFromMods( root, builder, es, modsForRoot( root ), LexNumCmp( true ) );
- }
- void ModSetState::createNewFromMods( const string& root,
- BSONBuilderBase& builder,
- BSONIteratorSorted& es,
- const ModStateRange& modRange,
- const LexNumCmp& lexNumCmp ) {
- // Merge-walk the original elements (es) and the mods (modRange), both sorted,
- // emitting the updated document into builder.
- ModStateHolder::iterator m = modRange.first;
- const ModStateHolder::const_iterator mend = modRange.second;
- BSONElement e = es.next();
- set<string> onedownseen;
- BSONElement prevE;
- while ( !e.eoo() && m != mend ) {
- if ( duplicateFieldName( prevE, e ) ) {//the previous element already consumed any matching mod, so this duplicate is unmodified -- copy it into the new object
- // Just copy through an element with a duplicate field name.
- builder.append( e );
- prevE = e;
- e = es.next();
- continue;
- }
- prevE = e;
- string field = root + e.fieldName();
- FieldCompareResult cmp = compareDottedFieldNames( m->second->m->fieldName , field ,
- lexNumCmp );
- switch ( cmp ) {
- case LEFT_SUBFIELD: { // Mod is embedded under this element
- uassert( 10145,//e.g. m->fieldName is "a.b.c" while field is "a.b"
- str::stream() << "LEFT_SUBFIELD only supports Object: " << field
- << " not: " << e.type() , e.type() == Object || e.type() == Array );
- if ( onedownseen.count( e.fieldName() ) == 0 ) {
- onedownseen.insert( e.fieldName() );
- if ( e.type() == Object ) {//recurse one level down to build the nested object
- BSONObjBuilder bb( builder.subobjStart( e.fieldName() ) );
- stringstream nr; nr << root << e.fieldName() << ".";
- createNewObjFromMods( nr.str() , bb , e.Obj() );
- bb.done();
- }
- else {
- BSONArrayBuilder ba( builder.subarrayStart( e.fieldName() ) );
- stringstream nr; nr << root << e.fieldName() << ".";
- createNewArrayFromMods( nr.str() , ba , BSONArray( e.embeddedObject() ) );
- ba.done();
- }
- // inc both as we handled both
- e = es.next();
- m++;
- }
- else {
- massert( 16069 , "ModSet::createNewFromMods - "
- "SERVER-4777 unhandled duplicate field" , 0 );
- }
- continue;
- }
- case LEFT_BEFORE: // Mod on a field that doesn't exist
- _appendNewFromMods( root , *m->second , builder , onedownseen );
- m++;
- continue;
- case SAME://the mod targets exactly this field: apply it, emitting the updated value
- m->second->apply( builder , e );
- e = es.next();
- m++;
- continue;
- case RIGHT_BEFORE: // field that doesn't have a MOD
- builder.append( e ); // if array, ignore field name
- e = es.next();
- continue;
- case RIGHT_SUBFIELD://e.g. e is "a.b.c" while the mod is "a" -- this cannot happen
- massert( 10399 , "ModSet::createNewFromMods - RIGHT_SUBFIELD should be impossible" , 0 );
- break;
- default:
- massert( 10400 , "unhandled case" , 0 );
- }
- }
- // finished looping the mods, just adding the rest of the elements
- while ( !e.eoo() ) {
- builder.append( e ); // if array, ignore field name
- e = es.next();
- }
- // do mods that don't have fields already
- for ( ; m != mend; m++ ) {//finally append mods whose fields did not exist in the original object
- _appendNewFromMods( root , *m->second , builder , onedownseen );
- }
- }
以及比较操作所以需要多多调试才能够体会.最后来看看updateRecord函数.
- const DiskLoc DataFileMgr::updateRecord(
- const char *ns,
- NamespaceDetails *d,
- NamespaceDetailsTransient *nsdt,
- Record *toupdate, const DiskLoc& dl,
- const char *_buf, int _len, OpDebug& debug, bool god) {
- BSONObj objOld = BSONObj::make(toupdate);
- BSONObj objNew(_buf);
- if( !objNew.hasElement("_id") && objOld.hasElement("_id") ) {//preserve the old _id if the update dropped it
- /* add back the old _id value if the update removes it. Note this implementation is slow
- (copies entire object multiple times), but this shouldn't happen often, so going for simple
- code, not speed.
- */
- BSONObjBuilder b;
- BSONElement e;
- verify( objOld.getObjectID(e) );
- b.append(e); // put _id first, for best performance
- b.appendElements(objNew);
- objNew = b.obj();
- }
- /* duplicate key check. we descend the btree twice - once for this check, and once for the actual inserts, further
- below. that is suboptimal, but it's pretty complicated to do it the other way without rollbacks...
- */
- vector<IndexChanges> changes;
- bool changedId = false;//walk every index collecting the key changes implied by old -> new
- getIndexChanges(changes, ns, *d, objNew, objOld, changedId);
- dupCheck(changes, *d, dl);
- if ( toupdate->netLength() < objNew.objsize() ) {//the record is too small: delete it and re-insert the new
- //object. netLength() covers the original data plus any trailing padding
- // doesn't fit. reallocate -----------------------------------------------------
- uassert( 10003 , "failing update: objects in a capped ns cannot grow", !(d && d->isCapped()));
- d->paddingTooSmall();//grow the padding factor so future allocations leave more slack and avoid repeating this
- deleteRecord(ns, toupdate, dl);
- DiskLoc res = insert(ns, objNew.objdata(), objNew.objsize(), god);
- if (debug.nmoved == -1) // default of -1 rather than 0
- debug.nmoved = 1;
- else
- debug.nmoved += 1;
- return res;
- }
- nsdt->notifyOfWriteOp();
- d->paddingFits();
- /* have any index keys changed? */
- {
- int keyUpdates = 0;//the record is big enough: first drop the stale index keys and add the new
- int z = d->nIndexesBeingBuilt();//ones, then copy the data in place at the end
- for ( int x = 0; x < z; x++ ) {
- IndexDetails& idx = d->idx(x);
- IndexInterface& ii = idx.idxInterface();
- for ( unsigned i = 0; i < changes[x].removed.size(); i++ ) {
- bool found = ii.unindex(idx.head, idx, *changes[x].removed[i], dl);
- if ( ! found ) {
- RARELY warning() << "ns: " << ns << " couldn't unindex key: " << *changes[x].removed[i]
- << " for doc: " << objOld["_id"] << endl;
- }
- }
- BSONObj idxKey = idx.info.obj().getObjectField("key");
- Ordering ordering = Ordering::make(idxKey);
- keyUpdates += changes[x].added.size();
- for ( unsigned i = 0; i < changes[x].added.size(); i++ ) {
- /* we did the dupCheck() above. so we don't have to worry about it here. */
- ii.bt_insert(
- idx.head,
- dl, *changes[x].added[i], ordering, /*dupsAllowed*/true, idx);
- }
- }
- debug.keyUpdates = keyUpdates;
- }
- // update in place
- int sz = objNew.objsize();//finally copy the new object's bytes over the record data
- memcpy(getDur().writingPtr(toupdate->data(), sz), objNew.objdata(), sz);
- return dl;
- }
从update的流程来看整个过程不复杂,但是其中细节很多,需要多多调试才能够明白,其中貌似有个bug,
代码注释中已说明.需要记住的是如果更新后数据的长度超过了原doc能够存储的数据长度,那么更新变更为一次
数据的删除以及一次数据的插入操作.
本文链接: mongodb源码分析(十二)数据的更新
作者: yhjj0108,杨浩