本文简单讲讲mongos对于查询和添加的处理流程.修改和删除的处理流程与添加差不多,不再分析.对于添加、修改和删除,mongos都只是将请求发往正确的mongod服务器让其处理;查询稍微麻烦一点,因为多个mongod服务器的查询结果返回后,汇总时需要由mongos自身完成排序.下面来看具体代码.在mongos的初始化部分我们已经知道,发往mongos的请求的处理函数是Request::process,下面来看看这个函数.
void Request::process( int attempt ) {
init();//初始化,加载要操作的数据库的DBConfig,对于sharded collection还要加载其chunkmanager
int op = _m.operation();
if ( op == dbKillCursors ) {
cursorCache.gotKillCursors( _m );
return;
}
int msgId = (int)(_m.header()->id);
Timer t;
Strategy * s = SHARDED;
_counter = &opsNonSharded;
_d.markSet();
bool iscmd = false;
if ( op == dbQuery ) {
iscmd = isCommand();
s->queryOp( *this );
}
else if ( op == dbGetMore ) {
checkAuth( Auth::READ ); // this is important so someone can't steal a cursor
s->getMore( *this );
}
else {
checkAuth( Auth::WRITE );
s->writeOp( op, *this );
}
globalOpCounters.gotOp( op , iscmd );
_counter->gotOp( op , iscmd );
}
首先来看查询dbQuery部分.这里查询调用的是ShardStrategy::queryOp
// Handles a query request on mongos.
//
// Flow, as visible in this function:
//   1. Commands are delegated to the SINGLE strategy.
//   2. A query on system.indexes whose "ns" field names a sharded collection is sent to a
//      single shard that owns chunks of that collection (or to the primary shard).
//   3. Otherwise a ParallelSortClusteredCursor is created and initialized. For explain
//      queries the explain document (plus elapsed "millis") is returned directly.
//   4. If the cursor is sharded, the first batch is built locally through a
//      ShardedClientCursor, which is stored in cursorCache when more data remains.
//      If not, the reply message of the single shard cursor is forwarded as-is.
//
// Ownership of the raw `cursor` pointer: deleted explicitly on the explain path, handed to
// ShardedClientCursor on the sharded path, and released by a scoped_ptr on the unsharded path.
// NOTE(review): if cursor->init() or cursor->explain() throws, `cursor` appears to leak —
// nothing owns it until one of the three paths above is reached. Confirm against callers.
virtual void queryOp( Request& r ) {
// TODO: These probably should just be handled here.
if ( r.isCommand() ) {// command requests sent to mongos; not analyzed further in this walkthrough
SINGLE->queryOp( r );
return;
}
QueryMessage q( r.d() );
r.checkAuth( Auth::READ );
QuerySpec qSpec( (string)q.ns, q.query, q.fields, q.ntoskip, q.ntoreturn, q.queryOptions );
if ( _isSystemIndexes( q.ns ) && q.query["ns"].type() == String && r.getConfig()->isSharded( q.query["ns"].String() ) ) {
// if you are querying on system.indexes, we need to make sure we go to a shard that actually has chunks
// this is not a perfect solution (what if you just look at all indexes)
// but better than doing nothing
ShardPtr myShard;// the query targets the xx.system.indexes collection, which (per the original annotation) stores all index information
ChunkManagerPtr cm;// for each ns we get either a chunk manager (ns is sharded) or a shard (ns is not sharded)
r.getConfig()->getChunkManagerOrPrimary( q.query["ns"].String(), cm, myShard );
if ( cm ) {
set<Shard> shards;
cm->getAllShards( shards );
verify( shards.size() > 0 );
myShard.reset( new Shard( *shards.begin() ) );
}
doQuery( r, *myShard );// send the query to myShard
return;
}// from here on: set up the parallel cursor
ParallelSortClusteredCursor * cursor = new ParallelSortClusteredCursor( qSpec, CommandInfo() );
// TODO: Move out to Request itself, not strategy based
{
long long start_millis = 0;
if ( qSpec.isExplain() ) start_millis = curTimeMillis64();
cursor->init();// per the original annotation: creates the underlying cursors, one per shard
shardedCursorTypes.hit( cursor->type() );
if ( qSpec.isExplain() ) {
// fetch elapsed time for the query
long long elapsed_millis = curTimeMillis64() - start_millis;
BSONObjBuilder explain_builder;
cursor->explain( explain_builder );
explain_builder.appendNumber( "millis", elapsed_millis );
BSONObj b = explain_builder.obj();
replyToQuery( 0 , r.p() , r.m() , b );
delete( cursor );
return;
}
}
if( cursor->isSharded() ){
ShardedClientCursorPtr cc (new ShardedClientCursor( q , cursor ));
BufBuilder buffer( ShardedClientCursor::INIT_REPLY_BUFFER_SIZE );
int docCount = 0;
const int startFrom = cc->getTotalSent();// captured before the batch is sent; used as the reply's starting position
bool hasMore = cc->sendNextBatch( r, q.ntoreturn, buffer, docCount );// reads documents from the cursor into buffer
if ( hasMore ) {// more data remains: keep the cursor for later getMore calls (original annotation: evicted after 600s idle — not shown here)
cursorCache.store( cc );
}
replyToQuery( 0, r.p(), r.m(), buffer.buf(), buffer.len(), docCount,
startFrom, hasMore ? cc->getId() : 0 );
}
else{// the collection is not sharded, so it lives on a single server
// Remote cursors are stored remotely, we shouldn't need this around.
// TODO: we should probably just make cursor an auto_ptr
scoped_ptr<ParallelSortClusteredCursor> cursorDeleter( cursor );
// TODO: Better merge this logic. We potentially can now use the same cursor logic for everything.
ShardPtr primary = cursor->getPrimary();// the server that holds the collection
DBClientCursorPtr shardCursor = cursor->getShardCursor( *primary );
// Implicitly stores the cursor in the cache
r.reply( *(shardCursor->getMessage()) , shardCursor->originalHost() );
// We don't want to kill the cursor remotely if there's still data left
shardCursor->decouple();
}
}
这里我们分析ParallelSortClusteredCursor,分析时只考虑sharded collection,不考虑非sharded collection.ParallelSortClusteredCursor::init调用的是其父类ClusteredCursor的init,该函数在已经初始化过之后不会再次初始化,其内部又调用了ParallelSortClusteredCursor::_init,这里直接来看_init函数.
/**
 * Chooses the initialization path: the legacy _oldInit() when the query spec is empty,
 * fullInit() otherwise.
 */
void ParallelSortClusteredCursor::_init() {
    if ( _qSpec.isEmpty() ) {
        _oldInit();
        return;
    }
    fullInit();
}
/**
 * Runs the two initialization phases back to back.
 */
void ParallelSortClusteredCursor::fullInit()
{
    // Phase 1: set up the per-shard cursors.
    startInit();

    // Phase 2: per the original annotation, performs the various checks and, most
    // importantly, stores the cursors obtained by startInit() into _cursors[].
    finishInit();
}
_init->startInit
// First phase of cursor initialization.
//
// Works out the set of shards to query (`todo`), then for each of them either keeps the
// existing per-shard state (when it is still compatible and no retry was requested) or
// rebuilds it: a fresh PCState, a versioned connection, and a DBClientCursor that is
// started lazily when the connection supports it and synchronously otherwise.
//
// NOTE(review): this excerpt shows no error handling around connection setup or cursor
// init, and `returnPartial` / `prefix` are not read here; the upstream function may
// contain more than is shown.
void ParallelSortClusteredCursor::startInit() {
bool returnPartial = ( _qSpec.options() & QueryOption_PartialResults );
bool specialVersion = _cInfo.versionedNS.size() > 0;
bool specialFilter = ! _cInfo.cmdFilter.isEmpty();
NamespaceString ns = specialVersion ? _cInfo.versionedNS : _qSpec.ns();
ChunkManagerPtr manager;
ShardPtr primary;
string prefix;
set<Shard> todoStorage;
set<Shard>& todo = todoStorage;
string vinfo;
if( isVersioned() ){// per the original annotation: taken when no explicit shard set was passed to the constructor; this is the path followed in this walkthrough
DBConfigPtr config = grid.getDBConfig( ns.db ); // Gets or loads the config
// Try to get either the chunk manager or the primary shard
config->getChunkManagerOrPrimary( ns, manager, primary );// a sharded collection yields a chunk manager, otherwise its primary shard
// Per the original annotation: a FieldRangeSetPair is built from the query filter to
// describe the value range of each query condition; those ranges together with the
// chunk manager identify the chunks to read, and therefore the shards this query is sent to.
if( manager ) manager->getShardsForQuery( todo, specialFilter ? _cInfo.cmdFilter : _qSpec.filter() );
else if( primary ) todo.insert( *primary );
// Close all cursors on extra shards first, as these will be invalid
for( map< Shard, PCMData >::iterator i = _cursorMap.begin(), end = _cursorMap.end(); i != end; ++i ){
if( todo.find( i->first ) == todo.end() ) i->second.cleanup();
}
}
else
todo = _qShards;
// Don't retry indefinitely for whatever reason
_totalTries++;
for( set<Shard>::iterator i = todo.begin(), end = todo.end(); i != end; ++i ){
const Shard& shard = *i;
PCMData& mdata = _cursorMap[ shard ];
// This may be the first time connecting to this shard, if so we can get an error here
{
if( mdata.initialized ){// already initialized: check whether the primary / chunk manager is still compatible (original annotation: the version changes after a split or chunk move)
PCStatePtr state = mdata.pcState;
bool compatiblePrimary = true;
bool compatibleManager = true;
// Only check for compatibility if we aren't forcing the shard choices
if( isVersioned() ){
compatiblePrimary = primary && state->primary && primary == state->primary;
compatibleManager = manager && state->manager && manager->compatibleWith( state->manager, shard );
}
if( compatiblePrimary || compatibleManager ){
// If we're compatible, don't need to retry unless forced
if( ! mdata.retryNext ) continue;
// Do partial cleanup
mdata.cleanup( false );
}
else
mdata.cleanup();
}
else
mdata.cleanup( false );
mdata.pcState.reset( new PCState() );
PCStatePtr state = mdata.pcState;
// Establish the connection and tell the target shard which version we hold. Per the
// original annotation, if the version is incompatible (a chunk move happened) the
// new information is reloaded from the config server.
setupVersionAndHandleSlaveOk( state, shard, primary, ns, vinfo, manager );
// From here to the end of this block `ns` is the query namespace string; it shadows
// the NamespaceString `ns` declared at the top of the function.
const string& ns = _qSpec.ns();
// Setup cursor
if( ! state->cursor ){// the shard cursor is actually created here
// Do a sharded query if this is not a primary shard *and* this is a versioned query,
// or if the number of shards to query is > 1
if( ( isVersioned() && ! primary ) || _qShards.size() > 1 ){
state->cursor.reset( new DBClientCursor( state->conn->get(), ns, _qSpec.query(),
isCommand() ? 1 : 0, // nToReturn (0 if query indicates multi)
0, // nToSkip
// Does this need to be a ptr?
_qSpec.fields().isEmpty() ? 0 : _qSpec.fieldsData(), // fieldsToReturn
_qSpec.options(), // options
// NtoReturn is weird.
// If zero, it means use default size, so we do that for all cursors
// If positive, it's the batch size (we don't want this cursor limiting results), that's
// done at a higher level
// If negative, it's the batch size, but we don't create a cursor - so we don't want
// to create a child cursor either.
// Either way, if non-zero, we want to pull back the batch size + the skip amount as
// quickly as possible. Potentially, for a cursor on a single shard or if we keep better track of
// chunks, we can actually add the skip value into the cursor and/or make some assumptions about the
// return value size ( (batch size + skip amount) / num_servers ).
_qSpec.ntoreturn() == 0 ? 0 :
( _qSpec.ntoreturn() > 0 ? _qSpec.ntoreturn() + _qSpec.ntoskip() :
_qSpec.ntoreturn() - _qSpec.ntoskip() ) ) ); // batchSize
}
else{
// Non-sharded
state->cursor.reset( new DBClientCursor( state->conn->get(), ns, _qSpec.query(),
_qSpec.ntoreturn(), // nToReturn
_qSpec.ntoskip(), // nToSkip
// Does this need to be a ptr?
_qSpec.fields().isEmpty() ? 0 : _qSpec.fieldsData(), // fieldsToReturn
_qSpec.options(), // options
0 ) ); // batchSize
}
}
bool lazyInit = state->conn->get()->lazySupported();
if( lazyInit ){
// Need to keep track if this is a second or third try for replica sets
state->cursor->initLazy( mdata.retryNext );
mdata.retryNext = false;
mdata.initialized = true;
}
else {
// Synchronous path: the cursor is fully started here, so the shard is marked finished.
// NOTE(review): `success` is assigned but never read in this excerpt.
bool success = false;
if( nsGetCollection( ns ) == "$cmd" )
success = state->cursor->initCommand();
else
success = state->cursor->init();// sends the query message
mdata.retryNext = false;
mdata.initialized = true;
mdata.finished = true;
}
}
}
}
finishInit不再分析,直接回到queryOp->sendNextBatch
/**
 * Fills `buffer` with the next batch of documents read from _cursor.
 *
 * Per the original annotation, _cursor is really a ParallelSortClusteredCursor, and the
 * documents coming back from the shards already satisfy the query, so they are appended
 * unfiltered.
 *
 * @param r         the client request; not read in this function.
 * @param ntoreturn client-requested count: 0 means "default batch", 1 or a negative value
 *                  means "single batch, no cursor", any other value is a soft batch size.
 * @param buffer    receives the raw bytes of each document.
 * @param docCount  set to the number of documents appended by this call.
 * @return true when the client may ask for more and the cursor still has data.
 */
bool ShardedClientCursor::sendNextBatch( Request& r , int ntoreturn ,
                                         BufBuilder& buffer, int& docCount ) {
    // 1MB budget for the first batch, three times that once something has been sent.
    const int byteBudget = ( _totalSent > 0 ) ? ( 1024 * 1024 ) * 3 : 1024 * 1024;

    // Send more if ntoreturn is 0, or any value > 1
    // (one is assumed to be a single doc return, with no cursor)
    const bool mayContinue = ! ( ntoreturn != 0 && ntoreturn <= 1 );
    const int batchLimit = abs( ntoreturn );

    docCount = 0;
    for ( ; _cursor->more(); ) {
        BSONObj doc = _cursor->next();
        buffer.appendBuf( (void*)doc.objdata() , doc.objsize() );
        ++docCount;

        if ( buffer.len() > byteBudget )
            break;

        // soft limit aka batch size
        if ( docCount == batchLimit )
            break;

        // first batch should be max 100 unless batch size specified
        if ( batchLimit == 0 && _totalSent == 0 && docCount >= 100 )
            break;
    }

    // more() is only consulted when the client is allowed to continue.
    const bool moreAvailable = mayContinue && _cursor->more();
    _totalSent += docCount;
    _done = ! moreAvailable;
    return moreAvailable;
}
我们接着看ParallelSortClusteredCursor::next,其作用就是每次从所有被查询的server中,按照sort排序返回一个值.
/**
 * Returns the next document of the merged result and consumes it from its shard cursor.
 *
 * Every shard cursor that still has data is peeked; the candidate from the first such
 * cursor becomes `best`, and a later candidate replaces it unless woSortOrder reports
 * `best` as strictly less under _sortKey. With an empty _sortKey the first cursor that
 * has data wins immediately. The scan starts one past the cursor used by the previous
 * call (_lastFrom) so that unsorted reads rotate across shards.
 *
 * Callers are expected to check more() first. If every cursor turns out to be exhausted,
 * an empty BSONObj is returned and neither _lastFrom nor any cursor is modified; the
 * previous code indexed _cursors[-1] in that situation.
 */
BSONObj ParallelSortClusteredCursor::next() {
    BSONObj best = BSONObj();
    int bestFrom = -1;

    for ( int j = 0; j < _numServers; j++ ) {
        // Iterate _numServers times, starting one past the last server we used.
        // This means we actually start at server #1, not #0, but shouldn't matter
        int i = ( j + _lastFrom + 1 ) % _numServers;

        if ( ! _cursors[i].more() ) {
            // This shard has no more data: mark its state as done.
            if ( _cursors[i].rawMData() )
                _cursors[i].rawMData()->pcState->done = true;
            continue;
        }

        // Peek only; the document is consumed below once the winner is known.
        BSONObj me = _cursors[i].peek();

        if ( best.isEmpty() ) {
            best = me;
            bestFrom = i;
            // No sort requested: the first available document is good enough.
            if ( _sortKey.isEmpty() ) break;
            continue;
        }

        int comp = best.woSortOrder( me , _sortKey , true );
        if ( comp < 0 )
            continue;

        best = me;
        bestFrom = i;
    }

    // Nothing left on any shard: bail out before bestFrom (-1) is used as an index.
    if ( bestFrom < 0 )
        return best;

    _lastFrom = bestFrom;
    // Actually advance the winning cursor; peek() above did not move it.
    _cursors[bestFrom].next();

    if ( _cursors[bestFrom].rawMData() )
        _cursors[bestFrom].rawMData()->pcState->count++;

    return best;
}
到这里mongos的查询讲解完毕,继续mongos的插入.
virtual void writeOp( int op , Request& r ) {
const char *ns = r.getns();
bool isIndexWrite = _isSystemIndexes( ns );
if( isIndexWrite ){
if ( r.getConfig()->isShardingEnabled() ){
handleIndexWrite( op , r );
return;
}
SINGLE->doWrite( op , r , Shard( r.getConfig()->getPrimary() ) );
r.gotInsert(); // Won't handle mulit-insert correctly. Not worth parsing the request.
return;
}
else{
DbMessage& d = r.d();
if ( op == dbInsert )
_insert( r , d );//插入操作的入口
else if ( op == dbUpdate )
_update( r , d );
else if ( op == dbDelete )
_delete( r , d );
else
throw UserException( 8016 , "can't do this write op on sharded collection" );
return;
}
}
writeOp->_insert
/**
 * Insert entry point: unpacks the wire message and forwards to the grouping overload.
 *
 * Reads every document carried by `d`, translates the reserved-field bits of the message
 * into insert flags, and delegates the actual routing.
 */
void _insert( Request& r , DbMessage& d ){
    const string& ns = r.getns();

    // Drain all documents to be inserted from the message.
    vector<BSONObj> docs;
    for ( ; d.moreJSObjs(); ) {
        docs.push_back( d.nextJsObj() );
    }

    // Whether to keep inserting after an individual insert fails.
    int flags = ( d.reservedField() & Reserved_InsertOption_ContinueOnError )
                    ? InsertOption_ContinueOnError
                    : 0;
    if ( d.reservedField() & Reserved_FromWriteback ) {
        flags |= WriteOption_FromWriteback;
    }

    _insert( ns, docs, flags, r, d );
}
void _insert( const string& ns,vector<BSONObj>& inserts,int flags,Request& r , DbMessage& d ){
map<ChunkPtr, vector<BSONObj> > insertsForChunks; // Map for bulk inserts to diff chunks
_insert( ns, inserts, insertsForChunks, flags, r, d );
}
// Groups the pending documents by target chunk and sends each group to the shard that
// owns it, one group per loop iteration. After a successful send the per-request insert
// counter is updated, the chunk is offered for auto-split, and the group is removed
// from insertsForChunks.
//
// NOTE(review): the `try` below has no `catch` in this excerpt, and `retries` is never
// read here; the error/retry handling is presumably in the part that is not shown.
void _insert( const string& ns,
vector<BSONObj>& insertsRemaining,
map<ChunkPtr, vector<BSONObj> >& insertsForChunks,
int flags,
Request& r, DbMessage& d, // TODO: remove
int retries = 0 ){
ChunkManagerPtr manager;
ShardPtr primary;
// This function handles grouping the inserts per-shard whether the collection is sharded or not.
// Per the original annotation: each document is classified by the range it falls into and
// queued under the matching chunk in the map<ChunkPtr, vector<BSONObj> >; the groups are
// then sent to the correct shard for insertion.
_groupInserts( ns, insertsRemaining, insertsForChunks, manager, primary );
// ContinueOnError is always on when using sharding.
flags |= manager ? InsertOption_ContinueOnError : 0;
while( ! insertsForChunks.empty() ){
ChunkPtr c = insertsForChunks.begin()->first;
vector<BSONObj>& objs = insertsForChunks.begin()->second;
const Shard& shard = c ? c->getShard() : primary.get();// the chunk's shard when there is a chunk, otherwise the primary
ShardConnection dbcon( shard, ns, manager );// connection to the target shard
try {
// Taken from single-shard bulk insert, should not need multiple methods in future
// insert( c->getShard() , r.getns() , objs , flags);
// It's okay if the version is set here, an exception will be thrown if the version is incompatible
dbcon.setVersion();// announce our version to the shard first; the insert only proceeds when it is compatible
// Certain conn types can't handle bulk inserts, so don't use unless we need to
if( objs.size() == 1 )// the actual insert happens here
dbcon->insert( ns, objs[0], flags );
else
dbcon->insert( ns , objs , flags);
dbcon.done();
int bytesWritten = 0;
for (vector<BSONObj>::iterator vecIt = objs.begin(); vecIt != objs.end(); ++vecIt) {
r.gotInsert(); // Record the correct number of individual inserts
bytesWritten += (*vecIt).objsize();
}
// TODO: The only reason we're grouping by chunks here is for auto-split, more efficient
// to track this separately and bulk insert to shards
if ( c && r.getClientInfo()->autoSplitOk() )
c->splitIfShould( bytesWritten );// split the chunk when it qualifies
insertsForChunks.erase( insertsForChunks.begin() );
}
}
}
继续看这里的splitIfShould函数,其将在需要的时候对chunk进行拆分.
// Accounts for `dataWritten` more bytes on this chunk and splits the chunk once enough
// data has accumulated; after a split it may also migrate the range named by the
// "shouldMigrate" field of the split result to another shard.
//
// Returns false when the write counter is still below the test threshold or when
// singleSplit() yields no split point, and true once a split happened (whether or not a
// migration followed).
//
// NOTE(review): the `try` below has no `catch` in this excerpt, so the error path is not
// visible here.
bool Chunk::splitIfShould( long dataWritten ) const {
LastError::Disabled d( lastError.get() );
try {
_dataWritten += dataWritten;// running total of bytes written to this chunk
int splitThreshold = getManager()->getCurrentDesiredChunkSize();
if ( minIsInf() || maxIsInf() )// either bound is infinite: use 90% of the threshold (original annotation: this is the initial chunk, not yet split)
splitThreshold = (int) ((double)splitThreshold * .9);
if ( _dataWritten < splitThreshold / ChunkManager::SplitHeuristics::splitTestFactor )
return false;
TicketHolderReleaser releaser( &(getManager()->_splitHeuristics._splitTickets) );
// this is a bit ugly
// we need it so that mongos blocks for the writes to actually be committed
// this does mean mongos has more back pressure than mongod alone
// since it nots 100% tcp queue bound
// this was implicit before since we did a splitVector on the same socket
ShardConnection::sync();
BSONObj res;// receives the result of the split; the actual split is the singleSplit call below
BSONObj splitPoint = singleSplit( false /* does not force a split if not enough data */ , res );
if ( splitPoint.isEmpty() ) {
// singleSplit would have issued a message if we got here
_dataWritten = 0; // this means there wasn't enough data to split, so don't want to try again until considerable more data
return false;
}
if ( maxIsInf() || minIsInf() ) {
// we don't want to reset _dataWritten since we kind of want to check the other side right away
}
else
_dataWritten = 0; // we're splitting, so should wait a bit
bool shouldBalance = grid.shouldBalance( _manager->getns() );// whether chunks of this namespace may be migrated to balance data across shards
BSONElement shouldMigrate = res["shouldMigrate"]; // not in mongod < 1.9.1 but that is ok
if ( ! shouldMigrate.eoo() && shouldBalance ){
BSONObj range = shouldMigrate.embeddedObject();
BSONObj min = range["min"].embeddedObject();
BSONObj max = range["max"].embeddedObject();
// reload sharding metadata before starting migration
Shard::reloadShardInfo();
Shard newLocation = Shard::pick( getShard() );// per the original annotation: picks the shard holding the least data
if ( getShard() == newLocation ) {
// if this is the best shard, then we shouldn't do anything (Shard::pick already logged our shard).
return true; // we did split even if we didn't migrate
}
ChunkManagerPtr cm = _manager->reload(false/*just reloaded in mulitsplit*/);
ChunkPtr toMove = cm->findChunk(min);
// The chunk found must match the suggested range exactly, otherwise skip the migration.
if ( ! (toMove->getMin() == min && toMove->getMax() == max) )
return true;
// NOTE(review): this `res` shadows the split result declared above; the massert message
// streams this inner object, which moveAndCommit is also given to fill — confirm the
// message is built after the call.
BSONObj res;
massert( 10412 ,
str::stream() << "moveAndCommit failed: " << res ,
toMove->moveAndCommit( newLocation , // performs the data migration
MaxChunkSize ,
false , /* secondaryThrottle - small chunk, no need */
res ) );
// update our config
_manager->reload();
}
return true;
}
}
到这里查询添加部分讲解完毕,下一篇文章将分析chunk的拆分,将接着从这里的singleSplit函数分析.
原文链接:mongodb源码分析(二十一)mongos查询与添加
作者: yhjj0108,杨浩