本文将分析mongodb中数据平衡的策略.先来看看流程.mongodb开启一个专门的balancer线程负责数据的平衡工作,它查看系统中所有的shard,发现有不平衡的情况就选择将其中某个shard服务器上的chunk迁移到其它服务器,让整个系统达到平衡.来看看平衡策略.
1. shard数据大小超过了shard配置的数据大小,从中选取chunk迁移到别处.
2. 找到shard中有违反tag规则的chunk,将这些chunk迁移到符合tag规则的shard中.
3. 对所有tag(额外加入了一个空的tag,以保证不存在tag时也有tag可供判断)逐个处理:
找出该tag中chunk数最多的shard,其chunk数目记作max;找出同一tag中chunk数最少
的shard,其chunk数目记作min;总chunk数目记作total.当max-min>=threshold(见下面代码的描述)
时,将选择从max所在shard迁移一个chunk到min所在shard.
//balancedLastTime表示上一个循环是否发生了chunk迁移,发生了则其中记录了迁移的chunk数
if(balancedLastTime || total<20)
threshold=2;
else if(total<80)
threshold=4;
else
threshold=8;
下面就来具体分析源码.源码位置为mongo\s\balance.cpp run函数,其在mongos初始化时被启动,作为一个单独的线程来管理系统的平衡.
// Body of the balancer background thread.
// First retries _init() once a minute until it succeeds (or shutdown begins),
// then loops until shutdown. Each iteration: pings, reloads shard info and the
// chunk size, asks grid.shouldBalance() whether to proceed, tries to take the
// "balancer" distributed lock, and if the lock is obtained runs
// _doBalanceRound() followed by _moveChunks() on whatever was selected.
// Any std::exception raised inside an iteration is swallowed; the loop sleeps
// and tries again.
void Balancer::run() {
// this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely
while ( ! inShutdown() ) {
if ( ! _init() ) {// initialization failed: wait one minute, then try again
sleepsecs( 60 );
continue;
}
break;
}
// seconds to sleep between rounds; re-derived from balancerConfig on every round that is allowed to run
int sleepTime = 30;
// getConnectioString and dist lock constructor does not throw, which is what we expect on while
// on the balancer thread
ConnectionString config = configServer.getConnectionString();
DistributedLock balanceLock( config , "balancer" );
while ( ! inShutdown() ) {
try {
scoped_ptr<ScopedDbConnection> connPtr(// connect to the config server
ScopedDbConnection::getInternalScopedDbConnection( config.toString() ) );
ScopedDbConnection& conn = *connPtr;
// ping has to be first so we keep things in the config server in sync
_ping( conn.conn() );// per the original note: refreshes this mongos's ping entry in config.mongos (_ping is not shown here)
// use fresh shard state
Shard::reloadShardInfo();
// refresh chunk size (even though another balancer might be active)
Chunk::refreshChunkSize();
BSONObj balancerConfig;
// now make sure we should even be running
// Per the original note, shouldBalance() returns false when a balance is already in
// progress, when the collection is configured not to be balanced, or when automatic
// balancing is switched off; otherwise true. (Its implementation is not shown here.)
if ( ! grid.shouldBalance( "", &balancerConfig ) ) {
// Ping again so scripts can determine if we're active without waiting
_ping( conn.conn(), true );
conn.done();
sleepsecs( sleepTime );
continue;
}
// NOTE(review): a true "_nosleep" selects the longer value (30) and false selects 6 -- confirm this is intended
sleepTime = balancerConfig["_nosleep"].trueValue() ? 30 : 6;
{// scope of the distributed lock attempt (lk)
dist_lock_try lk( &balanceLock , "doing balance round" );
if ( ! lk.got() ) {// the lock was not obtained
// Ping again so scripts can determine if we're active without waiting
_ping( conn.conn(), true );
conn.done();
sleepsecs( sleepTime ); // no need to wake up soon
continue;
}
vector<CandidateChunkPtr> candidateChunks;// receives at most one CandidateChunk per collection,
_doBalanceRound( conn.conn() , &candidateChunks );// so the vector never holds two chunks
if ( candidateChunks.size() == 0 ) { // of the same collection
_balancedLastTime = 0;
}
else {// store the count returned by _moveChunks() for use in the next round
_balancedLastTime = _moveChunks( &candidateChunks, balancerConfig["_secondaryThrottle"].trueValue() );
}
}
// Ping again so scripts can determine if we're active without waiting
_ping( conn.conn(), true );
conn.done();
// sleep one sixth of the usual time when the last round reported a non-zero count
sleepsecs( _balancedLastTime ? sleepTime / 6 : sleepTime );
}
catch ( std::exception& e ) {
// the exception object is not inspected here; back off and start a new round
sleepsecs( sleepTime ); // sleep a fair amount b/c of error
continue;
}
}
}
run->_doBalanceRound
/**
 * Selection phase of one balancing round.
 *
 * Reads the sharded collections, the shard list and, per collection, the chunk
 * and tag-range documents through `conn`, then asks the balancing policy for a
 * chunk to move. At most one candidate per collection is appended to
 * `candidateChunks`. Returns early, appending nothing, when there is no
 * balanceable collection or fewer than two shards.
 *
 * @param conn             connection used for all queries issued here
 * @param candidateChunks  output vector; chosen candidates are appended to it
 */
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) {
    // Step 1: collect the namespaces of the collections that may be balanced.
    auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() );
    vector< string > balanceable;
    for ( ; cursor->more(); ) {
        BSONObj entry = cursor->nextSafe();
        // only sharded collections carry a shard "key"; "noBalance" opts a collection out
        if ( entry["key"].eoo() || entry["noBalance"].trueValue() )
            continue;
        balanceable.push_back( entry["_id"].String() );
    }
    cursor.reset();

    if ( balanceable.empty() )
        return;

    // Step 2: snapshot every shard together with its limits and current status.
    vector<Shard> allShards;
    Shard::getAllShards( allShards );
    if ( allShards.size() < 2 ) {
        // a migration needs a source and a destination
        return;
    }

    ShardInfoMap shardInfo;
    for ( size_t k = 0; k < allShards.size(); ++k ) {
        const Shard& shard = allShards[k];
        ShardStatus shardStatus = shard.getStatus();
        shardInfo[ shard.getName() ] = ShardInfo( shard.getMaxSize(),
                                                  shardStatus.mapped(),
                                                  shard.isDraining(),
                                                  shardStatus.hasOpsQueued(), // per the original note: data still waiting to be written back
                                                  shard.tags()
                                                );
    }

    // Step 3: per collection, build the chunk distribution and consult the policy.
    for ( size_t c = 0; c < balanceable.size(); ++c ) {
        const string& ns = balanceable[c];

        // chunks grouped by the shard that currently holds them
        map< string,vector<BSONObj> > shardToChunksMap;
        cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) );
        while ( cursor->more() ) {
            BSONObj chunkDoc = cursor->nextSafe();
            // chunks flagged "jumbo" are left out of the distribution
            if ( ! chunkDoc["jumbo"].trueValue() ) {
                shardToChunksMap[ chunkDoc["shard"].String() ].push_back( chunkDoc.getOwned() );
            }
        }
        cursor.reset();

        if ( shardToChunksMap.empty() )
            continue;

        // touching operator[] creates an (empty) entry for shards that hold no chunk
        for ( vector<Shard>::iterator sh = allShards.begin(); sh != allShards.end(); ++sh ) {
            Shard s = *sh;
            shardToChunksMap[s.getName()].size();
        }

        DistributionStatus status( shardInfo, shardToChunksMap );

        // make sure the tags collection is indexed, then feed its ranges to the distribution
        conn.ensureIndex( ShardNS::tags, BSON( "ns" << 1 << "min" << 1 ), true );
        cursor = conn.query( ShardNS::tags , QUERY( "ns" << ns ).sort( "min" ) );
        while ( cursor->more() ) {
            BSONObj tagDoc = cursor->nextSafe();
            const TagRange range( tagDoc["min"].Obj().getOwned(),
                                  tagDoc["max"].Obj().getOwned(),
                                  tagDoc["tag"].String() );
            const bool accepted = status.addTagRange( range );
            uassert( 16356 , str::stream() << "tag ranges not valid for: " << ns , accepted );
        }
        cursor.reset();

        CandidateChunk* chosen = _policy->balance( ns, status, _balancedLastTime );
        if ( chosen != NULL ) {
            candidateChunks->push_back( CandidateChunkPtr( chosen ) );
        }
    }
}
run->_doBalanceRound->BalancerPolicy::balance
/**
 * Decides whether one chunk of collection `ns` should be migrated, and which one.
 *
 * Rules are tried in priority order and the first one that yields a move wins:
 *   1) a shard that is draining or has reached its size limit gives up a chunk
 *      (skipped while that shard has queued operations);
 *   2) a chunk living on a shard that does not carry the chunk's tag is moved;
 *   3) for each tag (plus the empty tag), when the most overloaded shard holds at
 *      least `threshold` more chunks of that tag than the best receiver shard,
 *      one chunk of that tag is moved from the former to the latter.
 *
 * The threshold is 2 when the previous round moved something or the collection
 * has fewer than 20 chunks, 4 when it has fewer than 80 chunks, and 8 otherwise.
 *
 * @param ns                namespace of the collection being examined
 * @param distribution      per-shard chunk layout, shard info and tag ranges
 * @param balancedLastTime  non-zero when the previous round moved chunks
 * @return a MigrateInfo allocated with new, or NULL when nothing should be moved
 */
MigrateInfo* BalancerPolicy::balance( const string& ns,
                                      const DistributionStatus& distribution,
                                      int balancedLastTime ) {
    // 1) check for shards that policy require to us to move off of
    //    draining, maxSize
    // 2) check tag policy violations
    // 3) then we make sure chunks are balanced for each tag

    // 1) check things we have to move
    {
        const set<string>& shards = distribution.shards();
        for ( set<string>::const_iterator z = shards.begin(); z != shards.end(); ++z ) {
            const string& shard = *z;
            const ShardInfo& info = distribution.shardInfo( shard );

            // only shards that are over their size limit or draining must shed chunks
            if ( ! info.isSizeMaxed() && ! info.isDraining() )
                continue;

            if ( distribution.numberOfChunksInShard( shard ) == 0 )
                continue; // nothing left on this shard to move

            // now we know we need to move to chunks off this shard
            // we will if we are allowed
            if ( info.hasOpsQueued() )
                continue;

            const vector<BSONObj>& chunks = distribution.getChunks( shard );

            // since we have to move all chunks, lets just do in order:
            // take the first chunk for which a receiver honouring its tag exists
            for ( unsigned i = 0; i < chunks.size(); i++ ) {
                const BSONObj& chunkToMove = chunks[i];
                const string tag = distribution.getTagForChunk( chunkToMove );
                const string to = distribution.getBestReceieverShard( tag );
                if ( to.size() == 0 )
                    continue;
                return new MigrateInfo( ns, to, shard, chunkToMove.getOwned() );
            }
        }
    }

    // 2) tag violations: a chunk sits on a shard that does not carry the chunk's tag
    if ( distribution.tags().size() > 0 ) {
        const set<string>& shards = distribution.shards();
        for ( set<string>::const_iterator i = shards.begin(); i != shards.end(); ++i ) {
            const string& shard = *i;
            const ShardInfo& info = distribution.shardInfo( shard );
            const vector<BSONObj>& chunks = distribution.getChunks( shard );

            for ( unsigned j = 0; j < chunks.size(); j++ ) {
                const string tag = distribution.getTagForChunk( chunks[j] );
                if ( info.hasTag( tag ) )
                    continue; // chunk is where its tag allows it to be

                // uh oh, this chunk is in the wrong place
                const string to = distribution.getBestReceieverShard( tag );
                if ( to.size() == 0 )
                    continue;
                return new MigrateInfo( ns, to, shard, chunks[j].getOwned() );
            }
        }
    }

    // 3) for each tag balance
    //    A move happens when (chunks on the most overloaded shard) minus
    //    (chunks on the best receiver) is at least `threshold`.
    int threshold = 8;
    if ( balancedLastTime || distribution.totalChunks() < 20 )
        threshold = 2;
    else if ( distribution.totalChunks() < 80 )
        threshold = 4;

    // randomize the order in which we balance the tags
    // this is so that one bad tag doesn't prevent others from getting balanced
    const set<string>& knownTags = distribution.tags();
    vector<string> tags( knownTags.begin(), knownTags.end() );
    tags.push_back( "" ); // the empty tag, so there is always one tag to evaluate
    std::random_shuffle( tags.begin(), tags.end() );

    for ( unsigned i = 0; i < tags.size(); i++ ) {
        const string& tag = tags[i];

        const string from = distribution.getMostOverloadedShard( tag );
        if ( from.size() == 0 )
            continue;

        const unsigned max = distribution.numberOfChunksInShardWithTag( from, tag );
        if ( max == 0 )
            continue;

        const string to = distribution.getBestReceieverShard( tag );
        if ( to.size() == 0 )
            return NULL;

        const unsigned min = distribution.numberOfChunksInShardWithTag( to, tag );

        // Subtract as signed values so a receiver holding more chunks than the
        // donor gives a plainly negative imbalance instead of relying on
        // unsigned wraparound followed by a narrowing conversion.
        const int imbalance = static_cast<int>( max ) - static_cast<int>( min );
        if ( imbalance < threshold )
            continue; // not far enough apart to justify a migration

        // pick the first chunk on the donor shard that belongs to this tag
        const vector<BSONObj>& chunks = distribution.getChunks( from );
        for ( unsigned j = 0; j < chunks.size(); j++ ) {
            if ( distribution.getTagForChunk( chunks[j] ) != tag )
                continue;
            // getOwned() for consistency with the other two return sites
            return new MigrateInfo( ns, to, from, chunks[j].getOwned() );
        }
    }

    // Everything is balanced here!
    return NULL;
}
回到run函数继续看_moveChunks函数.
int Balancer::_moveChunks( const vector<CandidateChunkPtr>* candidateChunks , bool secondaryThrottle ) {
int movedCount = 0;
for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) {
const CandidateChunk& chunkInfo = *it->get();
DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );//加载相应的配置与chunkManager
ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns );
ChunkPtr c = cm->findChunk( chunkInfo.chunk.min );//比照不等说明发生了chunk的拆分,不能再迁移这个chunk了
if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) {
// likely a split happened somewhere
cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */);
c = cm->findChunk( chunkInfo.chunk.min );
if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) )
continue;
}
BSONObj res;//真正的chunk迁移,前文已经分析,这里不再分析
if ( c->moveAndCommit( Shard::make( chunkInfo.to ) , Chunk::MaxChunkSize , secondaryThrottle , res ) ) {
movedCount++;//记录这次迁移的chunk数
continue;
}
// the move requires acquiring the collection metadata's lock, which can fail
if ( res["chunkTooBig"].trueValue() ) {//迁移失败,太大了,将其拆分
// reload just to be safe
cm = cfg->getChunkManager( chunkInfo.ns );
c = cm->findChunk( chunkInfo.chunk.min );
res = BSONObj();
c->singleSplit( true , res );
if ( ! res["ok"].trueValue() ) {//拆分失败,将其标记为jumbo
c->markAsJumbo();
// we increment moveCount so we do another round right away
movedCount++;
}
}
}
return movedCount;
}
到这里mongodb数据的平衡分析完毕,因为前面一系列分析文章的基础,这里的流程还是挺简单的.
原文链接:mongodb源码分析(二十四)mongos数据的平衡
作者: yhjj0108,杨浩