mongodb源码分析(二十三)mongos chunk的迁移

最新推荐文章于 2022-12-28 16:48:02 发布

yhjj0108

最新推荐文章于 2022-12-28 16:48:02 发布

阅读量3.7k

点赞数 1

分类专栏： mongodb源码分析

本文链接：https://blog.csdn.net/yhjj0108/article/details/8440821

版权

mongodb源码分析专栏收录该内容

24 篇文章 3 订阅

订阅专栏

本文我们分析一个chunk的迁移，下文中将分析mongodb的shard平衡策略。之所以分成两篇文章来分析，是因为chunk的迁移涉及的命令太多、代码太长。下面首先来看看chunk的迁移流程.

1. 将要迁移chunk端A首先记录chunk迁移数据的位置.

2. 通知远端B,让其执行_recvChunkStart开始chunk的迁移.

3. B端首先从A端system.indexes读取索引,并将其插入到自身上.

4. B端读取A端数据,并插入到自己的collection.

5. B端执行在从A端读取数据时A端产生的删除,插入操作.

6. A端在B端读取数据时一直向B端询问是否已经操作完毕准备提交了.

7. B端通知自己已经ready等待提交数据.

8. A端通知B端提交数据.

9. B端提交数据.

10. A端更新configserver配置数据,更新自己的chunkmanager.

11. A端清空自己记录的迁移数据位置,清空已经移动到了B端的数据.

下面进入代码分析。chunk的迁移是由movechunk命令完成的,这个命令是在迁移chunk的服务器(A端)上运行的,其代码非常长,这里分成几段来讲解.

// moveChunk command handler (first part: steps 1-3 of the outline below).
// Per the surrounding article this runs on the shard that donates the chunk ("side A").
// It reads the command arguments from cmdObj, takes the distributed lock for the
// collection, checks the chunk/version metadata on the config server, records the
// locations of the chunk's documents and finally sends _recvChunkStart to the
// destination shard ("side B"). On the failure paths visible here it returns false.
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
    // 1. parse options
    // 2. make sure my view is complete and lock
    // 3. start migrate
    //    in a read lock, get all DiskLoc and sort so we can do as little seeking as possible
    //    tell to start transferring
    // 4. pause till migrate caught up
    // 5. LOCK
    //    a) update my config, essentially locking
    //    b) finish migrate
    //    c) update config server
    //    d) logChange to config server
    // 6. wait for all current cursors to expire
    // 7. remove data locally
    // 1. argument parsing
    string ns = cmdObj.firstElement().str();
    string to = cmdObj["to"].str();
    string from = cmdObj["from"].str(); // my public address, a tad redundant, but safe
    // if we do a w=2 after every write
    bool secondaryThrottle = cmdObj["secondaryThrottle"].trueValue();
    if ( secondaryThrottle && ! anyReplEnabled() )
        secondaryThrottle = false;
    BSONObj min  = cmdObj["min"].Obj();
    BSONObj max  = cmdObj["max"].Obj();
    BSONElement shardId = cmdObj["shardId"];
    BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"];
    const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes
    // first sharding-related call on this server: enable sharding state from the
    // "configdb" argument before anything else touches it
    if ( ! shardingState.enabled() ) {
        string configdb = cmdObj["configdb"].String();
        shardingState.enable( configdb );
        configServer.init( configdb );
    }
    MoveTimingHelper timing( "from" , ns , min , max , 6 /* steps */ , errmsg );
    // Make sure we're as up-to-date as possible with shard information
    // This catches the case where we had to previously changed a shard's host by
    // removing/adding a shard with the same name
    Shard::reloadShardInfo();
    // So 2.2 mongod can interact with 2.0 mongos, mongod needs to handle either a conn
    // string or a shard in the to/from fields.  The Shard constructor handles this,
    // eventually we should break the compatibility.
    Shard fromShard( from );
    Shard toShard( to );
    timing.done(1);
    // 2.
    // take the distributed lock for the collection named by ns
    DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC ) , ns );
    dist_lock_try dlk;
    dlk = dist_lock_try( &lockSetup , (string)"migrate-" + min.toString() );
    // NOTE(review): the result of got() is discarded here; presumably the
    // lock-not-acquired handling was trimmed from this excerpt - confirm.
    dlk.got();
    BSONObj chunkInfo = BSON("min" << min << "max" << max << "from" << fromShard.getName() << "to" << toShard.getName() );
    configServer.logChange( "moveChunk.start" , ns , chunkInfo );
    ShardChunkVersion maxVersion;
    string myOldShard;
    {
        scoped_ptr<ScopedDbConnection> conn(ScopedDbConnection::getInternalScopedDbConnection(shardingState.getConfigServer()) );
        BSONObj x;
        BSONObj currChunk;
        // x: the chunk document of this collection with the highest "lastmod"
        x = conn->get()->findOne( ShardNS::chunk,Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) );
        currChunk = conn->get()->findOne( ShardNS::chunk , shardId.wrap( "_id" ) );// currChunk is the chunk that is about to be moved
        maxVersion = ShardChunkVersion::fromBSON( x, "lastmod" );
        myOldShard = currChunk["shard"].String();
        conn->done();
        BSONObj currMin = currChunk["min"].Obj();
        BSONObj currMax = currChunk["max"].Obj();
        // the config server must agree that the chunk currently lives on the "from" shard
        if ( myOldShard != fromShard.getName() )
            return false;
        // the config server's highest version must not be older than what this shard holds
        if ( maxVersion < shardingState.getVersion( ns ) )
            return false;
        // since this could be the first call that enable sharding we also make sure to have the chunk manager up to date
        shardingState.gotShardName( myOldShard );
        // Using the maxVersion we just found will enforce a check - if we use zero version,
        // it's possible this shard will be *at* zero version from a previous migrate and
        // no refresh will be done
        // TODO: Make this less fragile
        ShardChunkVersion shardVersion = maxVersion;// update the version info and load the chunk manager
        shardingState.trySetVersion( ns , shardVersion /* will return updated */ );
    }
    timing.done(2);
    // 3.
    ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns );
    BSONObj shardKeyPattern = chunkManager->getKey();
    MigrateStatusHolder statusHolder( ns , min , max , shardKeyPattern );
    {
        // this gets a read lock, so we know we have a checkpoint for mods
        // Record the disk locations of the documents in this chunk so that side B's
        // later reads can be served from them; per the article the locations are kept
        // in a set because they need to be sorted.
        if ( ! migrateFromStatus.storeCurrentLocs( maxChunkSize , errmsg , result ) )
            return false;
        scoped_ptr<ScopedDbConnection> connTo(
                ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
        BSONObj res;
        bool ok;
        // NOTE(review): "ok" is assigned but not inspected within this excerpt.
        ok = connTo->get()->runCommand( "admin" ,// tell side B to start the chunk migration
                                            BSON( "_recvChunkStart" << ns <<
                                                  "from" << fromShard.getConnString() <<
                                                  "min" << min <<
                                                  "max" << max <<
                                                  "shardKeyPattern" << shardKeyPattern <<
                                                  "configServer" << configServer.modelServer() <<
                                                  "secondaryThrottle" << secondaryThrottle
                                                  ) ,
                                            res );
        connTo->done();
    }

下面继续来看这里的函数_recvChunkStart.

// _recvChunkStart command handler (per the article: executed on the receiving shard,
// "side B"). Refuses to start when a migration is already active; otherwise copies the
// migration parameters out of cmdObj into migrateStatus, launches migrateThread on a
// new thread and reports {"started": true}.
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
    // Only one incoming migration at a time.
    const bool alreadyMigrating = migrateStatus.getActive();
    if ( alreadyMigrating ) {
        errmsg = "migrate already in progress";
        return false;
    }

    // Lazily initialise the config server handle from the address the sender gave us.
    if ( ! configServer.ok() ) {
        configServer.init( cmdObj["configServer"].String() );
    }

    migrateStatus.prepare();

    // Namespace is the value of the command's first element; the BSON objects are
    // copied (getOwned) so they do not point into cmdObj's buffer.
    migrateStatus.ns = cmdObj.firstElement().String();
    migrateStatus.from = cmdObj["from"].String();
    migrateStatus.min = cmdObj["min"].Obj().getOwned();
    migrateStatus.max = cmdObj["max"].Obj().getOwned();
    migrateStatus.shardKeyPattern = cmdObj["shardKeyPattern"].Obj().getOwned();

    // Throttling only applies when it was requested and replication is enabled.
    const bool throttleRequested = cmdObj["secondaryThrottle"].trueValue();
    migrateStatus.secondaryThrottle = throttleRequested && anyReplEnabled();

    // Dedicated worker thread that performs the actual migration.
    boost::thread worker( migrateThread );
    result.appendBool( "started" , true );
    return true;
}

下面来看这个线程migrateThread.其调用migrateStatus.go做迁移工作,而这个go函数其实是_go的包装,下面直接从_go开始分析.

// Body of the receiving-side ("side B") migration; per the article it is wrapped by
// migrateStatus.go(), which migrateThread calls.
// Phases, tracked through the `state` member:
//   0. create the collection locally if it does not exist yet
//   1. copy the index definitions from the donor
//   2. delete any local data already inside [min, max)
//   3. CLONE   - bulk copy documents via _migrateClone
//   4. CATCHUP - apply modifications via _transferMods until none are pending
//   5. STEADY  - keep applying modifications until the donor asks for the commit
// On a failed remote command the state is set to FAIL and the function returns.
void _go() {
    // half of the reported slaves plus one
    slaveCount = ( getSlaveCount() / 2 ) + 1;
    scoped_ptr<ScopedDbConnection> connPtr(ScopedDbConnection::getScopedDbConnection( from ) );
    ScopedDbConnection& conn = *connPtr;// connection to the donor ("from") shard
    conn->getLastError(); // just test connection
    {
        // 0. copy system.namespaces entry if collection doesn't already exist
        Client::WriteContext ctx( ns );
        // Only copy if ns doesn't already exist
        if ( ! nsdetails( ns.c_str() ) ) {// the collection does not exist locally, so create it
            string system_namespaces = NamespaceString( ns ).db + ".system.namespaces";
            BSONObj entry = conn->findOne( system_namespaces, BSON( "name" << ns ) );
            if ( entry["options"].isABSONObj() ) {
                string errmsg;
                if ( ! userCreateNS( ns.c_str(), entry["options"].Obj(), errmsg, true, 0 ) )
                    warning() << "failed to create collection with options: " << errmsg
                              << endl;
            }
        }
    }
    {                
        // 1. copy indexes   
        vector<BSONObj> all;
        {  // collect every index definition of this collection from the donor
            auto_ptr<DBClientCursor> indexes = conn->getIndexes( ns );
            while ( indexes->more() ) {
                all.push_back( indexes->next().getOwned() );
            }
        }// per the article, inserting a document into <db>.system.indexes builds the corresponding index
        for ( unsigned i=0; i<all.size(); i++ ) {
            BSONObj idx = all[i];
            Client::WriteContext ct( ns );
            string system_indexes = cc().database()->name + ".system.indexes";
            theDataFileMgr.insertAndLog( system_indexes.c_str() , idx, true /* flag fromMigrate in oplog */ );
        }
    }
    {
        // 2. delete any data already in range
        RemoveSaver rs( "moveChunk" , ns , "preCleanup" );
        // NOTE(review): `num` is not used within this excerpt.
        long long num = Helpers::removeRange( ns ,
                                              min ,
                                              max ,
                                              findShardKeyIndexPattern_unlocked( ns , shardKeyPattern ) , 
                                              false , /*maxInclusive*/
                                              secondaryThrottle , /* secondaryThrottle */
                                              cmdLine.moveParanoia ? &rs : 0 , /*callback*/
                                              true ); /* flag fromMigrate in oplog */
    }
    {
        // 3. initial bulk clone
        state = CLONE;
        while ( true ) {// clone data from the donor into the local database, batch by batch
            BSONObj res;// filled with one batch read from side A
            if ( ! conn->runCommand( "admin" , BSON( "_migrateClone" << 1 ) , res ) ) {  // gets array of objects to copy, in disk order
                state = FAIL;
                conn.done();
                return;
            }// "objects" holds the actual documents
            BSONObj arr = res["objects"].Obj();
            int thisTime = 0;
            BSONObjIterator i( arr );
            while( i.more() ) {
                BSONObj o = i.next().Obj();
                {
                    PageFaultRetryableSection pgrs;
                    // retry the upsert until it completes without a page fault
                    while ( 1 ) {
                        try {
                            Lock::DBWrite lk( ns );
                            Helpers::upsert( ns, o, true );// write the document locally
                            break;
                        }
                        catch ( PageFaultException& e ) {
                            e.touch();
                        }
                    }
                }
                thisTime++;
                numCloned++;
                clonedBytes += o.objsize();
                if ( secondaryThrottle ) {// with this option set, wait for the write to replicate (w=2, up to 60 seconds) before continuing
                    // NOTE(review): a failed wait is ignored here (empty body).
                    if ( ! waitForReplication( cc().getLastOp(), 2, 60 /* seconds to wait */ ) ) {
                    }
                }
            }
            // an empty batch means the donor has nothing left to clone
            if ( thisTime == 0 )
                break;
        }
    }
    // if running on a replicated system, we'll need to flush the docs we cloned to the secondaries
    ReplTime lastOpApplied = cc().getLastOp().asDate();
    {
        // 4. do bulk of mods
        state = CATCHUP;// the chunk may have been modified on the donor while it was being cloned,
        while ( true ) {// so fetch those modifications from the donor and apply them locally
            BSONObj res;// receives the transferred modifications
            if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
                state = FAIL;
                conn.done();
                return;
            }
            // size == 0: no modifications pending, caught up
            if ( res["size"].number() == 0 )
                break;
            apply( res , &lastOpApplied );// apply the modifications sent by side A
            // poll every 20 ms, up to maxIterations times, for the applied ops to be replicated
            const int maxIterations = 3600*50;
            int i;
            for ( i=0;i<maxIterations; i++) {
                if ( state == ABORT ) {
                    timing.note( "aborted" );
                    return;
                }
                if ( opReplicatedEnough( lastOpApplied ) )
                    break;                
                sleepmillis( 20 );
            }
            // the loop ran out without sufficient replication: give up
            if ( i == maxIterations ) {
                conn.done();
                state = FAIL;
                return;
            } 
        }
    }
    { 
        // pause to wait for replication
        // this will prevent us from going into critical section until we're ready
        Timer t;
        while ( t.minutes() < 600 ) {
            if ( flushPendingWrites( lastOpApplied ) )
                break;
            sleepsecs(1);
        }
    }

    {
        // 5. wait for commit
        state = STEADY;// wait for side A's commit command; while waiting keep replaying side A's modifications
        while ( state == STEADY || state == COMMIT_START ) {
            BSONObj res;
            if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
                state = FAIL;
                conn.done();
                return;
            }
            // something was applied: immediately ask for more
            if ( res["size"].number() > 0 && apply( res , &lastOpApplied ) )
                continue;
            if ( state == ABORT ) {
                timing.note( "aborted" );
                return;
            }
            // commit requested: leave the loop once pending writes are flushed
            if ( state == COMMIT_START ) {
                if ( flushPendingWrites( lastOpApplied ) )
                    break;
            }
            sleepmillis( 10 );
        }
        if ( state == FAIL ) {
            return;
        }
        timing.done(5);
    }
    state = DONE;
    conn.done();
}

下面我们回到A端首先看看_migrateClone,这个命令是B端从A端读取数据时执行的.执行的函数为MigrateFromStatus::clone.

// Serves _migrateClone on the donor shard ("side A"): appends to `result`, under the
// key "objects", an array of the documents stored at the recorded chunk locations
// (_cloneLocs), and removes the locations that were copied. One call stops when the
// array is close to BSONObjMaxUserSize or when _cloneLocs is exhausted; the caller
// keeps calling until it receives an empty array.
//
// errmsg: set to the reason when false is returned.
// result: receives the "objects" array on success.
// Returns false when no migration is active or the collection cannot be found.
bool clone( string& errmsg , BSONObjBuilder& result ) {
    if ( ! _getActive() ) {// a chunk migration must have been started
        errmsg = "not active";
        return false;
    }
    ElapsedTracker tracker (128, 10); // same as ClientCursor::_yieldSometimesTracker
    int allocSize;
    {
        Client::ReadContext ctx( _ns );
        NamespaceDetails *d = nsdetails( _ns.c_str() );
        if ( ! d ) {
            // do not dereference a missing collection; report it instead
            errmsg = "ns not found, should be impossible";
            return false;
        }
        scoped_spinlock lk( _trackerLocks );
        // Estimate the buffer size in 64 bits and clamp BEFORE narrowing to int, so a
        // chunk with many documents cannot overflow into a negative allocation size.
        // (assumes averageObjectSize() is non-negative)
        const unsigned long long estimate =
            ( 12ULL + d->averageObjectSize() ) * _cloneLocs.size();
        allocSize = estimate < static_cast<unsigned long long>( BSONObjMaxUserSize )
                    ? static_cast<int>( estimate )
                    : BSONObjMaxUserSize;
    }
    BSONArrayBuilder a (allocSize);
    while ( 1 ) {
        bool filledBuffer = false;
        auto_ptr<LockMongoFilesShared> fileLock;
        Record* recordToTouch = 0;
        {
            Client::ReadContext ctx( _ns );
            scoped_spinlock lk( _trackerLocks );
            // the chunk's document locations recorded earlier
            set<DiskLoc>::iterator i = _cloneLocs.begin();
            for ( ; i!=_cloneLocs.end(); ++i ) {
                if (tracker.intervalHasElapsed()) // should I yield?
                    break;
                DiskLoc dl = *i;// load the actual record
                Record* r = dl.rec();
                if ( ! r->likelyInPhysicalMemory() ) {
                    // remember the record and touch it after the locks above are released
                    fileLock.reset( new LockMongoFilesShared() );
                    recordToTouch = r;
                    break;
                }
                BSONObj o = dl.obj();
                // use the builder size instead of accumulating 'o's size so that we take into consideration
                // the overhead of BSONArray indices
                if ( a.len() + o.objsize() + 1024 > BSONObjMaxUserSize ) {
                    filledBuffer = true; // break out of outer while loop
                    break;
                }
                a.append( o );// copy the document into the reply
            }// drop the locations that have been copied already
            _cloneLocs.erase( _cloneLocs.begin() , i );
            if ( _cloneLocs.empty() || filledBuffer )
                break;
        }
        if ( recordToTouch ) {
            // its safe to touch here because we have a LockMongoFilesShared
            // we can't do where we get the lock because we would have to unlock the main readlock and the _trackerLocks
            // simpler to handle this out there
            recordToTouch->touch();
            recordToTouch = 0;
        }
    }// the array built here is what gets sent to side B
    result.appendArray( "objects" , a.arr() );
    return true;
}

继续来看命令_transferMods,它负责将B端读取数据期间A端产生的修改日志传到B端.这里调用的函数为:MigrateFromStatus::transferMods

// Serves _transferMods on the donor shard: moves the pending entries of _deleted and
// _reload into `b` (arrays "deleted" and "reload") and appends the accumulated byte
// count as "size".
//
// errmsg: set to the reason when false is returned.
// b:      receives the "deleted"/"reload" arrays (when non-empty) and "size".
// Returns false when no migration is active.
bool transferMods( string& errmsg , BSONObjBuilder& b ) {
    if ( ! _getActive() ) {
        // tell the caller why the command failed instead of returning a bare false
        errmsg = "no active migration!";
        return false;
    }
    long long size = 0;
    Client::ReadContext cx( _ns );
    // deleted entries are sent as recorded; reload entries are looked up again by _id
    // (explode == true) so the current document is sent
    xfer( &_deleted , b , "deleted" , size , false );
    xfer( &_reload , b , "reload" , size , true );
    b.append( "size" , size );
    return true;
}
// Drains entries from the front of `l` into a sub-array of `b` called `name`, adding
// to the running byte total `size`. Nothing is written when the list is empty or the
// total already exceeds the 1 MiB budget. With `explode` set, each entry is used as a
// lookup key (Helpers::findById) and the document found is appended instead of the
// entry; entries whose lookup fails are dropped from the list without being appended.
void xfer( list<BSONObj> * l , BSONObjBuilder& b , const char * name , long long& size , bool explode ) {
    const long long maxSize = 1024 * 1024;
    if ( l->empty() || size > maxSize )
        return;

    BSONArrayBuilder arr(b.subarrayStart(name));
    for ( list<BSONObj>::iterator cur = l->begin();
          cur != l->end() && size < maxSize;
          cur = l->erase( cur ) ) {
        BSONObj entry = *cur;
        if ( ! explode ) {
            arr.append( entry );
        }
        else {
            // fetch the current version of the document
            BSONObj current;
            if ( Helpers::findById( cc() , _ns.c_str() , entry, current ) ) {
                arr.append( current );
                size += current.objsize();
            }
        }
        // the entry's own size is always counted, whether or not it was exploded
        size += entry.objsize();
    }
    arr.done();
}

那么现在继续看看这里的_deleted和_reload的出处.

// Records one operation: passes it to _logOp when replSettings.master is set, and in
// every case hands it to logOpForSharding.
void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b, bool fromMigrate) {
    if ( replSettings.master ) {
        _logOp( opstr , ns , 0 , obj , patt , b , fromMigrate );
    }

    // forwarded regardless of the master setting
    logOpForSharding( opstr , ns , obj , patt );
}
// Thin forwarder: passes the operation on to migrateFromStatus.logOp unchanged.
void logOpForSharding( const char * opstr ,
                       const char * ns ,
                       const BSONObj& obj ,
                       BSONObj * patt ) {
    migrateFromStatus.logOp( opstr , ns , obj , patt );
}
void logOp( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) {
    if ( ! _getActive() )//chunk迁移时记录删除以及插入等动作,以达到同步的目的
        return;
    if ( _ns != ns )
        return;
    // no need to log if this is not an insertion, an update, or an actual deletion
    // note: opstr 'db' isn't a deletion but a mention that a database exists (for replication
    // machinery mostly)
    char op = opstr[0];
    if ( op == 'n' || op =='c' || ( op == 'd' && opstr[1] == 'b' ) )
        return;
    BSONElement ide;
    if ( patt )
        ide = patt->getField( "_id" );
    else
        ide = obj["_id"];
    BSONObj it;
    switch ( opstr[0] ) {
    case 'd': {
        if ( getThreadName() == cleanUpThreadName ) {
            // we don't want to xfer things we're cleaning
            // as then they'll be deleted on TO
            // which is bad
            return;
        }
        // can't filter deletes :(
        _deleted.push_back( ide.wrap() );
        _memoryUsed += ide.size() + 5;
        return;
    }
    case 'i':
        it = obj;
        break;
    case 'u':
        if ( ! Helpers::findById( cc() , _ns.c_str() , ide.wrap() , it ) ) {
            return;
        }
        break;
    }
    if ( ! isInRange( it , _min , _max ) )
        return;
    _reload.push_back( ide.wrap() );
    _memoryUsed += ide.size() + 5;
}

下面我们回到A端movechunk命令上来:

    // Continuation of the donor-side moveChunk run(): steps 4-7.
    // Polls side B until it reports "steady", enters the critical section, donates the
    // chunk locally, asks side B to commit, updates the chunk documents on the config
    // server through a single applyOps command and finally removes the moved data
    // from this shard.
    // 4.
    for ( int i=0; i<86400; i++ ) { // don't want a single chunk move to take more than a day
        sleepsecs( 1 );
        scoped_ptr<ScopedDbConnection> conn(
                ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
        BSONObj res;
        bool ok;
        // ask the receiving side how far it is, i.e. whether it has finished copying and is waiting for the commit
        ok = conn->get()->runCommand( "admin" , BSON( "_recvChunkStatus" << 1 ) , res );
        res = res.getOwned();
        conn->done();
        if ( ! ok || res["state"].String() == "fail" ) {
            result.append( "cause" , res );
            return false;
        }
        if ( res["state"].String() == "steady" )// wait for the "steady" state
            break;
        // the migration is using too much memory: tell side B to abort
        // NOTE(review): despite its name, mbUsed() is compared against a byte count
        // (500 * 1024 * 1024) - confirm the unit it returns.
        if ( migrateFromStatus.mbUsed() > (500 * 1024 * 1024) ) {
            // this is too much memory for us to use for this
            // so we're going to abort the migrate
            scoped_ptr<ScopedDbConnection> conn(ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
            BSONObj res;
            conn->get()->runCommand( "admin" , BSON( "_recvChunkAbort" << 1 ) , res );
            res = res.getOwned();
            conn->done();
            result.appendBool( "split" , true );
            return false;
        }
        killCurrentOp.checkForInterrupt();
    }
    // 5.
    {// the actual commit
        // 5.a
        // we're under the collection lock here, so no other migrate can change maxVersion or ShardChunkManager state
        migrateFromStatus.setInCriticalSection( true );
        ShardChunkVersion currVersion = maxVersion;
        ShardChunkVersion myVersion = currVersion;
        myVersion.incMajor();// a chunk migration bumps the major version by one
        {
            Lock::DBWrite lk( ns );
            verify( myVersion > shardingState.getVersion( ns ) );
            // bump the chunks manager's version up and "forget" about the chunk being moved
            // this is not the commit point but in practice the state in this shard won't until the commit it done
            shardingState.donateChunk( ns , min , max , myVersion );// remove this chunk from the local chunk manager
        }
        // 5.b
        // we're under the collection lock here, too, so we can undo the chunk donation because no other state change
        // could be ongoing
        {
            BSONObj res;
            scoped_ptr<ScopedDbConnection> connTo(ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
            bool ok;// result of asking side B to commit
            ok = connTo->get()->runCommand( "admin" ,BSON( "_recvChunkCommit" << 1 ) ,res );
            connTo->done();
            if ( ! ok ) {// the commit failed: put the chunk that was donated above back
                Lock::DBWrite lk( ns );
                // revert the chunk manager back to the state before "forgetting" about the chunk
                shardingState.undoDonateChunk( ns , min , max , currVersion );
                result.append( "cause" , res );
                return false;
            }
        }
        // 5.c
        // version at which the next highest lastmod will be set
        // if the chunk being moved is the last in the shard, nextVersion is that chunk's lastmod
        // otherwise the highest version is from the chunk being bumped on the FROM-shard
        ShardChunkVersion nextVersion;
        // we want to go only once to the configDB but perhaps change two chunks, the one being migrated and another
        // local one (so to bump version for the entire shard)
        // we use the 'applyOps' mechanism to group the two updates and make them safer
        // TODO pull config update code to a module
        BSONObjBuilder cmdBuilder;
        // update the chunk documents on the config server: the moved chunk now belongs to
        // another shard, so the chunks collection has to be modified
        BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) );
        {
            // update for the chunk being moved
            BSONObjBuilder op;
            op.append( "op" , "u" );
            op.appendBool( "b" , false /* no upserting */ );
            op.append( "ns" , ShardNS::chunk );
            BSONObjBuilder n( op.subobjStart( "o" ) );
            n.append( "_id" , Chunk::genID( ns , min ) );
            myVersion.addToBSON( n, "lastmod" );
            n.append( "ns" , ns );
            n.append( "min" , min );
            n.append( "max" , max );
            n.append( "shard" , toShard.getName() );
            n.done();
            BSONObjBuilder q( op.subobjStart( "o2" ) );
            q.append( "_id" , Chunk::genID( ns , min ) );
            q.done();
            updates.append( op.obj() );
        }
        nextVersion = myVersion;
        // if we have chunks left on the FROM shard, update the version of one of them as well
        // we can figure that out by grabbing the chunkManager installed on 5.a
        // TODO expose that manager when installing it
        ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns );
        if( chunkManager->getNumChunks() > 0 ) {
            // get another chunk on that shard
            BSONObj lookupKey;
            BSONObj bumpMin, bumpMax;
            // walk the chunk manager until a chunk other than the one being moved is found
            do {
                chunkManager->getNextChunk( lookupKey , &bumpMin , &bumpMax );
                lookupKey = bumpMin;
            }
            while( bumpMin == min );
            BSONObjBuilder op;
            op.append( "op" , "u" );
            op.appendBool( "b" , false );
            op.append( "ns" , ShardNS::chunk );
            nextVersion.incMinor();  // same as used on donateChunk
            BSONObjBuilder n( op.subobjStart( "o" ) );
            n.append( "_id" , Chunk::genID( ns , bumpMin ) );
            nextVersion.addToBSON( n, "lastmod" );
            n.append( "ns" , ns );
            n.append( "min" , bumpMin );
            n.append( "max" , bumpMax );
            n.append( "shard" , fromShard.getName() );
            n.done();
            BSONObjBuilder q( op.subobjStart( "o2" ) );
            q.append( "_id" , Chunk::genID( ns , bumpMin  ) );
            q.done();
            updates.append( op.obj() );
        }
        updates.done();
        // precondition: the highest lastmod of this collection on the config server must
        // still equal maxVersion
        BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) );
        {
            BSONObjBuilder b;
            b.append( "ns" , ShardNS::chunk );
            b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) );
            {
                BSONObjBuilder bb( b.subobjStart( "res" ) );
                // TODO: For backwards compatibility, we can't yet require an epoch here
                bb.appendTimestamp( "lastmod", maxVersion.toLong() );
                bb.done();
            }
            preCond.append( b.obj() );
        }
        preCond.done();
        BSONObj cmd = cmdBuilder.obj();
        bool ok = false;
        BSONObj cmdResult;// receives the reply of the update command
        scoped_ptr<ScopedDbConnection> conn(
                    ScopedDbConnection::getInternalScopedDbConnection(
                            shardingState.getConfigServer() ) );
            // NOTE(review): `ok` is not inspected after this call within this excerpt;
            // presumably the failure handling was trimmed - confirm.
            ok = conn->get()->runCommand( "config" , cmd , cmdResult );
            conn->done();
        migrateFromStatus.setInCriticalSection( false );
        // 5.d
        configServer.logChange( "moveChunk.commit" , ns , chunkInfo );
    }
    migrateFromStatus.done();
    {// finally delete this chunk's data from the local shard
        // 6.
        OldDataCleanup c;
        c.secondaryThrottle = secondaryThrottle;
        c.ns = ns;
        c.min = min.getOwned();
        c.max = max.getOwned();
        c.shardKeyPattern = shardKeyPattern.getOwned();
        ClientCursor::find( ns , c.initial );
        // c.initial non-empty: hand the cleanup to a background thread running
        // cleanupOldData; otherwise remove the data inline right away
        if ( c.initial.size() ) {
            boost::thread t( boost::bind( &cleanupOldData , c ) );
        }
        else {
            // 7.
            c.doRemove();
        }
    }
    return true;
}

到这里一个chunk的迁移工作完成,流程相当长,代码还是比较好阅读的,需要注意的是不要将两个端的代码搞混了.

原文链接: mongodb源码分析(二十三)mongos chunk的迁移

作者: yhjj0108,杨浩

yhjj0108

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
1
评论
mongodb源码分析(二十三)mongos chunk的迁移

本文我们分析一个chunk的迁移，下文中将分析mongodb的shard平衡策略。之所以分成两篇文章来分析，是因为chunk的迁移涉及的命令太多、代码太长。下面首先来看看chunk的迁移流程.1. 将要迁移chunk端A首先记录chunk迁移数据的位置.2. 通知远端B,让其执行_recvChunkStart开始chunk的迁移.3. B端首先从A端system.indexes读取索引,并将
复制链接

扫一扫