The previous article analyzed replica set initialization. Here we continue with the replica set's synchronization, which involves two threads and one function:
producerThread: on a non-primary server, picks a sync target and reads oplog entries from it.
startSyncThread: reads the oplog entries fetched by the producer thread and replays them.
msgCheckNewState: drives state transitions between members, e.g. secondary<->primary.
Let's start with producerThread, which loops calling _producerThread to fetch oplog entries. Stepping into _producerThread:
void BackgroundSync::_producerThread() {
    MemberState state = theReplSet->state();
    // we want to pause when the state changes to primary
    if (state.primary()) { // a primary does not sync from anyone, so pause
        if (!_pause) {
            stop();
        }
        sleepsecs(1);
        return;
    }
    if (state.fatal() || state.startup()) {
        sleepsecs(5);
        return;
    }
    // if this member has an empty oplog, we cannot start syncing
    if (theReplSet->lastOpTimeWritten.isNull()) {
        sleepsecs(1);
        return;
    }
    // we want to unpause when we're no longer primary
    // start() also loads _lastOpTimeFetched, which we know is set from the "if"
    else if (_pause) { // _pause starts out true; start() records the last op time --
        start();       // for a freshly initialized set, that is the oplog entry
    }                  // written when the config was saved
    produce();
}
_producerThread->produce: its job is to pick a sync target, read oplog entries from it, and push them into a blocking queue for the sync thread to consume.
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r(false /* doHandshake */);
    // find a target to sync from the last op time written
    getOplogReader(r); // pick the sync target
    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        r.tailingQueryGTE(rsoplog, _lastOpTimeFetched); // tail ops newer than the last one fetched
    }
    if (isRollbackRequired(r)) { // undo our extra ops by applying their inverses;
        stop();                  // not analyzed further here -- read it if interested
        return;
    }
    while (!inShutdown()) {
        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) { // no more data in the current batch
                if (theReplSet->gotForceSync()) { // a forced target was set via rs.syncFrom
                    return;
                }
                if (theReplSet->isPrimary()) { // a primary does not sync
                    return;
                }
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (!_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                        return;
                    }
                }
                r.more();
            }
            if (!r.more())
                break;
            BSONObj o = r.nextSafe().getOwned();
            Timer timer;
            _buffer.push(o); // queue the entry in _buffer for the sync thread to consume
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                // update counters
                _queueCounter.waitTime += timer.millis();
                _queueCounter.numElems++;
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime(); // remember the newest fetched optime
            }
        } // end while
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                return;
            }
        }
        r.tailCheck();
        if( !r.haveCursor() ) {
            return;
        }
        // looping back is ok because this is a tailable cursor
    }
}
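The _buffer used above is a blocking queue shared between the producer and the sync thread. As a standalone illustration of that handoff, here is a minimal sketch using std::thread and a condition variable in place of MongoDB's BlockingQueue; all names in it are mine, not MongoDB's:

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

// Illustrative stand-in for BackgroundSync's _buffer (a BlockingQueue<BSONObj>).
class OplogBuffer {
    std::queue<std::string> q_; // the real queue stores BSONObj oplog entries
    std::mutex m_;
    std::condition_variable cv_;
public:
    void push(std::string op) {
        { std::lock_guard<std::mutex> lk(m_); q_.push(std::move(op)); }
        cv_.notify_one(); // wake the sync thread
    }
    std::string pop() { // blocks until an op is available
        std::unique_lock<std::mutex> lk(m_);
        cv_.wait(lk, [this] { return !q_.empty(); });
        std::string op = q_.front(); q_.pop();
        return op;
    }
};

int main() {
    OplogBuffer buffer;
    // producer: plays the role of BackgroundSync::produce()
    std::thread producer([&] {
        for (int i = 0; i < 3; ++i) buffer.push("op-" + std::to_string(i));
    });
    // consumer: plays the role of the sync thread replaying ops
    for (int i = 0; i < 3; ++i) std::cout << "apply " << buffer.pop() << "\n";
    producer.join();
}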
_producerThread->produce->getOplogReader
void BackgroundSync::getOplogReader(OplogReader& r) {
    Member *target = NULL, *stale = NULL;
    BSONObj oldest;
    // then we're initial syncing and we're still waiting for this to be set
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            _currentSyncTarget = NULL;
            return;
        }
    }
    // Pick a member whose oplog is newer than ours and that has been reachable
    // recently; with several candidates, the one with the lowest ping time wins,
    // and a forced sync target (rs.syncFrom) is taken directly. Once chosen, the
    // target stays _currentSyncTarget until it becomes unreachable, its oplog
    // turns out too stale, or stop() is called.
    while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
        string current = target->fullName();
        if (!r.connect(current)) { // unreachable: veto it for a while
            r.resetConnection();
            theReplSet->veto(current);
            continue;
        }
        if (isStale(r, oldest)) { // our newest local op is older than the source's oldest op
            r.resetConnection();
            theReplSet->veto(current, 600);
            stale = target;
            continue;
        }
        // if we made it here, the target is up and not stale
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = target;
        }
        return;
    }
    // the only viable sync target was stale
    if (stale) {
        theReplSet->goStale(stale, oldest);
        sleepsecs(120);
    }
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        _currentSyncTarget = NULL;
    }
}
_producerThread->produce->getOplogReader->getMemberToSyncTo
Member* ReplSetImpl::getMemberToSyncTo() {
    lock lk(this);
    bool buildIndexes = true;
    // if we have a target we've requested to sync from, use it
    if (_forceSyncTarget) { // a forced target is taken directly
        Member* target = _forceSyncTarget;
        _forceSyncTarget = 0;
        return target;
    }
    // wait for 2N pings before choosing a sync target
    if (_cfg) { // require at least twice as many pings as there are members
        int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings;
        if (needMorePings > 0) {
            return NULL;
        }
        buildIndexes = myConfig().buildIndexes;
    }
    // find the member with the lowest ping time that has more data than me
    // Find primary's oplog time. Reject sync candidates that are more than
    // MAX_SLACK_TIME seconds behind.
    OpTime primaryOpTime;
    static const unsigned maxSlackDurationSeconds = 10 * 60; // 10 minutes
    const Member* primary = box.getPrimary();
    if (primary)
        primaryOpTime = primary->hbinfo().opTime;
    else
        // choose a time that will exclude no candidates, since we don't see a primary
        primaryOpTime = OpTime(maxSlackDurationSeconds, 0);
    if ( primaryOpTime.getSecs() < maxSlackDurationSeconds ) {
        // erh - I think this means there was just a new election
        // and we don't yet know the new primary's optime
        primaryOpTime = OpTime(maxSlackDurationSeconds, 0);
    }
    OpTime oldestSyncOpTime(primaryOpTime.getSecs() - maxSlackDurationSeconds, 0);
    Member *closest = 0;
    time_t now = 0;
    // Make two attempts. The first attempt, we ignore those nodes with
    // slave delay higher than our own. The second attempt includes such
    // nodes, in case those are the only ones we can reach.
    // This loop attempts to set 'closest'.
    for (int attempts = 0; attempts < 2; ++attempts) {
        for (Member *m = _members.head(); m; m = m->next()) {
            if (!m->hbinfo().up())
                continue;
            // make sure members with buildIndexes sync from other members w/indexes
            if (buildIndexes && !m->config().buildIndexes) // an index-building member may only sync from another index-building member
                continue;
            if (!m->state().readable()) // skip states other than primary/secondary
                continue;
            if (m->state() == MemberState::RS_SECONDARY) {
                // only consider secondaries that are ahead of where we are
                if (m->hbinfo().opTime <= lastOpTimeWritten) // its oplog is no newer than ours
                    continue;
                // omit secondaries that are excessively behind, on the first attempt at least.
                if (attempts == 0 &&
                    m->hbinfo().opTime < oldestSyncOpTime)
                    continue;
            }
            // omit nodes that are more latent than anything we've already considered
            if (closest && // already have a candidate: keep whichever has the lower ping
                (m->hbinfo().ping > closest->hbinfo().ping))
                continue;
            if ( attempts == 0 &&
                 myConfig().slaveDelay < m->config().slaveDelay ) {
                continue; // skip this one in the first attempt
            }
            // A vetoed member (e.g. one we previously failed to connect to) cannot
            // become a candidate again until its veto expires; the duration is set
            // per veto() call and defaults to 10 seconds.
            map<string,time_t>::iterator vetoed = _veto.find(m->fullName());
            if (vetoed != _veto.end()) {
                // Do some veto housekeeping
                if (now == 0) {
                    now = time(0);
                }
                // if this was on the veto list, check if it was vetoed in the last "while".
                // if it was, skip.
                if (vetoed->second >= now) {
                    continue;
                }
                _veto.erase(vetoed);
                // fall through, this is a valid candidate now
            }
            // This candidate has passed all tests; set 'closest'
            closest = m;
        }
        if (closest) break; // no need for second attempt
    }
    if (!closest) {
        return NULL;
    }
    return closest;
}
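Stripped of the heartbeat and locking plumbing, the selection policy above boils down to the following standalone sketch; the types and names are mine, and the veto map mirrors _veto:

#include <ctime>
#include <map>
#include <string>
#include <vector>

// Minimal stand-in for a replica set member as seen by getMemberToSyncTo().
struct Candidate {
    std::string name;
    bool up;
    bool readable;        // state is PRIMARY or SECONDARY
    long long opTimeSecs; // last oplog time reported via heartbeat
    int pingMillis;
};

// Two-pass selection: pass 0 rejects members that are excessively behind;
// pass 1 relaxes that, mirroring the relaxation in the loop above.
const Candidate* chooseSyncTarget(const std::vector<Candidate>& members,
                                  long long myOpTimeSecs,
                                  long long oldestAcceptableSecs,
                                  std::map<std::string, time_t>& veto) {
    const Candidate* closest = 0;
    time_t now = time(0);
    for (int attempt = 0; attempt < 2 && !closest; ++attempt) {
        for (size_t i = 0; i < members.size(); ++i) {
            const Candidate& m = members[i];
            if (!m.up || !m.readable) continue;
            if (m.opTimeSecs <= myOpTimeSecs) continue; // not ahead of us
            if (attempt == 0 && m.opTimeSecs < oldestAcceptableSecs) continue;
            if (closest && m.pingMillis > closest->pingMillis) continue;
            std::map<std::string, time_t>::iterator v = veto.find(m.name);
            if (v != veto.end()) {
                if (v->second >= now) continue; // still vetoed
                veto.erase(v);                  // veto expired
            }
            closest = &m;
        }
    }
    return closest; // may be null: no viable target right now
}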
That completes the producer thread. On to startSyncThread, whose entry point is ReplSetImpl::syncThread:
void ReplSetImpl::syncThread() {
    while( 1 ) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        if( ! _self ) {
            sleepsecs(20);
            continue;
        }
        if( myConfig().arbiterOnly ) { // an arbiter never syncs data
            return;
        }
        try {
            _syncThread(); // the actual sync work
        }
        catch(DBException& e) {
            sethbmsg(str::stream() << "syncThread: " << e.toString());
            sleepsecs(10);
        }
        catch(...) {
            sethbmsg("unexpected exception in syncThread()");
            sleepsecs(60);
        }
        sleepsecs(1);
    }
}
void ReplSetImpl::_syncThread() {
    StateBox::SP sp = box.get();
    if( sp.state.primary() ) { // a primary does not sync
        sleepsecs(1);
        return;
    }
    // everything below runs only on a secondary
    if( _blockSync || sp.state.fatal() || sp.state.startup() ) {
        sleepsecs(5);
        return;
    }
    /* do we have anything at all? */
    if( lastOpTimeWritten.isNull() ) { // never synced: do an initial sync, cloning the data from a remote member
        syncDoInitialSync();
        return; // _syncThread will be recalled, starts from top again in case sync failed.
    }
    /* we have some data. continue tailing. */
    replset::SyncTail tail(replset::BackgroundSync::get()); // BackgroundSync is a singleton; it holds the buffer the producer thread fills
    tail.oplogApplication(); // read the queued oplog entries and replay them
}
_syncThread->syncDoInitialSync
void ReplSetImpl::syncDoInitialSync() {
    const static int maxFailedAttempts = 10;
    createOplog(); // creates local.oplog.rs if it does not exist yet
    int failedAttempts = 0;
    while ( failedAttempts < maxFailedAttempts ) {
        try {
            _syncDoInitialSync();
            break;
        }
        catch(DBException& e) {
            failedAttempts++;
            sleepsecs(30);
        }
    }
}
_syncThread->syncDoInitialSync->_syncDoInitialSync
void ReplSetImpl::_syncDoInitialSync() {
    replset::InitialSync init(replset::BackgroundSync::get());
    // if this is the first node, it may have already become primary
    const Member *source = getMemberToSyncTo();
    string sourceHostname = source->h().toString();
    init.setHostname(sourceHostname);
    OplogReader r;
    BSONObj lastOp = r.getLastOp(rsoplog);
    if (replSettings.fastsync) { // fastsync skips the clone and only primes the oplog with the last op;
        log() << "fastsync: skipping database clone" << rsLog; // used carelessly it can leave some data unsynced
        // prime oplog
        init.oplogApplication(lastOp, lastOp);
        return;
    }
    else {
        dropAllDatabasesExceptLocal();
        list<string> dbs = r.conn()->getDatabaseNames();
        // clone every database except local from the source; this call blocks
        // until all databases have been copied
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
            veto(source->fullName(), 600); // veto the source: don't retry it for 600s
            sleepsecs(300);
            return;
        }
        BSONObj minValid; // apply oplog entries from lastOp up to minValid
        if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
            return;
        }
        lastOp = minValid;
        // its currently important that lastOp is equal to the last op we actually pulled
        // this is because the background thread only pulls each op once now
        // so if its now, we'll be waiting forever
        {
            // this takes whatever the last op the we got is
            // and stores it locally before we wipe it out below
            Lock::DBRead lk(rsoplog);
            Helpers::getLast(rsoplog, lastOp);
            lastOp = lastOp.getOwned();
        }
        // reset state, as that "didn't count"
        emptyOplog();
        lastOpTimeWritten = OpTime();
        lastH = 0;
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) { // second clone pass: build the indexes (the first pass copied data only)
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }
    }
    BSONObj minValid;
    if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) { // apply the oplog once more, covering ops made during the index builds
        return;
    }
    {
        Client::WriteContext cx( "local." );
        cx.ctx().db()->flushFiles(true);
        Helpers::putSingleton("local.replset.minvalid", minValid);
        cx.ctx().db()->flushFiles(true);
    }
    changeState(MemberState::RS_RECOVERING);
}
_syncThread->syncDoInitialSync->_syncDoInitialSync->_syncDoInitialSync_applyToHead
bool ReplSetImpl::_syncDoInitialSync_applyToHead( replset::InitialSync& init, OplogReader* r,
                                                  const Member* source, const BSONObj& lastOp ,
                                                  BSONObj& minValid ) {
    /* our cloned copy will be strange until we apply oplog events that occurred
       through the process. we note that time point here. */
    try {
        // It may have been a long time since we last used this connection to
        // query the oplog, depending on the size of the databases we needed to clone.
        // A common problem is that TCP keepalives are set too infrequent, and thus
        // our connection here is terminated by a firewall due to inactivity.
        // Solution is to increase the TCP keepalive frequency.
        minValid = r->getLastOp(rsoplog);
    } catch ( SocketException & ) {
        if( !r->connect(source->h().toString()) ) {
            throw;
        }
        // retry
        minValid = r->getLastOp(rsoplog);
    }
    OpTime mvoptime = minValid["ts"]._opTime();
    OpTime startingTS = lastOp["ts"]._opTime();
    // apply startingTS..mvoptime portion of the oplog
    init.oplogApplication(lastOp, minValid); // replay the oplog entries
    return true;
}
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication
void InitialSync::oplogApplication(const BSONObj& applyGTEObj, const BSONObj& minValidObj) {
    OpTime applyGTE = applyGTEObj["ts"]._opTime();
    OpTime minValid = minValidObj["ts"]._opTime();
    syncApply(applyGTEObj);   // apply the first entry
    _logOpObjRS(applyGTEObj); // write it to local.oplog.rs and update lastOpTimeWritten
    // if there were no writes during the initial sync, there will be nothing in the queue so
    // just go live
    if (minValid == applyGTE) {
        return;
    }
    OpTime ts;
    time_t start = time(0);
    unsigned long long n = 0, lastN = 0;
    while( ts < minValid ) { // more ops were generated while we were cloning data and indexes
        OpQueue ops;
        while (ops.getSize() < replBatchSizeBytes) { // accumulate a batch
            if (tryPopAndWaitForMore(&ops)) {
                break;
            }
        }
        multiApply(ops.getDeque(), multiInitialSyncApply); // replay the batch on a thread pool, each thread running multiInitialSyncApply
        n += ops.getDeque().size();
        if ( n > lastN + 1000 ) {
            time_t now = time(0);
            if (now - start > 10) {
                // simple progress metering
                log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                      << ts.toStringPretty() << rsLog;
                start = now;
                lastN = n;
            }
        }
        // we want to keep a record of the last op applied, to compare with minvalid
        const BSONObj& lastOp = ops.getDeque().back();
        OpTime tempTs = lastOp["ts"]._opTime();
        applyOpsToOplog(&ops.getDeque()); // write the batch to local.oplog.rs
        ts = tempTs;
    }
}
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication->multiApply
void SyncTail::multiApply( std::deque<BSONObj>& ops, MultiSyncApplyFunc applyFunc ) {
    // Use a ThreadPool to prefetch all the operations in a batch.
    prefetchOps(ops); // pre-load the touched indexes and documents into memory, in parallel
    std::vector< std::vector<BSONObj> > writerVectors(theReplSet->replWriterThreadCount);
    fillWriterVectors(ops, &writerVectors); // partition the ops across the writers, one thread per vector
    // We must grab this because we're going to grab write locks later.
    // We hold this mutex the entire time we're writing; it doesn't matter
    // because all readers are blocked anyway.
    SimpleMutex::scoped_lock fsynclk(filesLockedFsync);
    // stop all readers until we're done
    Lock::ParallelBatchWriterMode pbwm;
    applyOps(writerVectors, applyFunc); // each writer thread runs applyFunc over its vector
}
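fillWriterVectors is what makes the parallel replay safe: it partitions the batch so that all ops for a given namespace land in the same writer vector and are therefore applied in order by a single thread. A minimal sketch of that idea (the concrete hash choice is illustrative; the real code hashes the ns string):

#include <functional>
#include <string>
#include <vector>

struct Op { std::string ns; /* plus the rest of the oplog entry */ };

// Partition a batch so ops touching the same namespace always go to the
// same writer thread -- per-namespace order is preserved even though the
// writers run in parallel.
std::vector< std::vector<Op> > fillWriterVectorsSketch(const std::vector<Op>& ops,
                                                       size_t writerThreadCount) {
    std::vector< std::vector<Op> > writers(writerThreadCount);
    std::hash<std::string> h;
    for (size_t i = 0; i < ops.size(); ++i)
        writers[h(ops[i].ns) % writerThreadCount].push_back(ops[i]);
    return writers;
}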
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication->multiInitialSyncApply
void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
    initializeWriterThread();
    for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) {
        if (!st->syncApply(*it)) { // the apply failed
            bool status;
            {
                Lock::GlobalWrite lk;
                // The target document may be missing: even though the databases were
                // cloned, a concurrent update can move a document past the cloner's
                // cursor so the clone misses it. shouldRetry() fetches the document
                // from the sync source and inserts it locally so the op can be retried.
                status = st->shouldRetry(*it);
            }
            if (status) {
                // retry
                fassert(15915, st->syncApply(*it));
            }
            // If shouldRetry() returns false, fall through.
            // This can happen if the document that was moved and missed by Cloner
            // subsequently got deleted and no longer exists on the Sync Target at all
        }
    }
}
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication->multiInitialSyncApply->syncApply
bool SyncTail::syncApply(const BSONObj &op, bool convertUpdateToUpsert/*=false*/) {
    const char *ns = op.getStringField("ns");
    bool isCommand(op["op"].valuestrsafe()[0] == 'c');
    boost::scoped_ptr<Lock::ScopedLock> lk;
    if(isCommand) {
        // a command may need a global write lock. so we will conservatively go
        // ahead and grab one here. suboptimal. :-(
        lk.reset(new Lock::GlobalWrite());
    } else {
        // DB level lock for this operation
        lk.reset(new Lock::DBWrite(ns));
    }
    Client::Context ctx(ns, dbpath, false);
    ctx.getClient()->curop()->reset();
    // For non-initial-sync, we convert updates to upserts
    // to suppress errors when replaying oplog entries.
    bool ok = !applyOperation_inlock(op, true, convertUpdateToUpsert); // replay one entry; already analyzed in the master/slave articles
    getDur().commitIfNeeded();
    return ok;
}
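syncApply dispatches on the fields of the oplog entry. For reference, a 2.x oplog document has roughly the following shape (the values are made up; an update additionally carries an o2 field holding the match criteria):

{
    "ts" : Timestamp(1368718888, 1),           // OpTime: seconds + counter; the o["ts"] read above
    "h"  : NumberLong("5100931068916398977"),  // unique op hash; the o["h"] read above
    "op" : "i",                                // i=insert, u=update, d=delete, c=command, n=no-op
    "ns" : "test.users",                       // namespace the op applies to
    "o"  : { "_id" : 1, "name" : "foo" }       // the document / command payload
}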
Back in _syncThread, the non-initial path runs the following; note that SyncTail::oplogApplication here differs from the InitialSync::oplogApplication above.
/* we have some data. continue tailing. */
replset::SyncTail tail(replset::BackgroundSync::get());
tail.oplogApplication();
void SyncTail::oplogApplication() {
    while( 1 ) {
        OpQueue ops;
        time_t lastTimeChecked = time(0);
        // always fetch a few ops first
        // tryPopAndWaitForMore returns true when we need to end a batch early
        while (!tryPopAndWaitForMore(&ops) &&
               (ops.getSize() < replBatchSizeBytes)) {
            time_t now = time(0);
            // occasionally check some things
            if (ops.empty() || now > lastTimeChecked) {
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                if (!theReplSet->isSecondary()) { // first pass after startup: promote ourselves to secondary
                    OpTime minvalid;
                    theReplSet->tryToGoLiveAsASecondary(minvalid); // the new state then propagates to the other members via heartbeats
                }
                // normally msgCheckNewState gets called periodically, but in a single node repl set
                // there are no heartbeat threads, so we do it here to be sure. this is relevant if the
                // singleton member has done a stepDown() and needs to come back up.
                if (theReplSet->config().members.size() == 1 && // a single-member set must trigger the state change itself
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null? During replsettest'ing.
                    if (mgr) mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                    sleepsecs(1);
                    return;
                }
            }
        }
        const BSONObj& lastOp = ops.getDeque().back();
        handleSlaveDelay(lastOp);
        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        {
            Client::WriteContext cx( "local" );
            Helpers::putSingleton("local.replset.minvalid", lastOp);
        }
        multiApply(ops.getDeque(), multiSyncApply); // the actual replay of the batch
        applyOpsToOplog(&ops.getDeque());
    }
}
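The order of the last three steps is what makes a crash in the middle of a batch safe. A sketch of the invariant, with stub functions standing in for the calls above (names are mine):

#include <string>
#include <vector>

typedef std::string Op; // stand-in for a BSONObj oplog entry

// Stubs standing in for the real persistence calls.
void persistMinValid(const Op& lastOpInBatch) { /* Helpers::putSingleton(...) */ }
void applyBatch(const std::vector<Op>& ops)   { /* multiApply(...) */ }
void writeBatchToLocalOplog(const std::vector<Op>& ops) { /* applyOpsToOplog(...) */ }

void applyOneBatch(const std::vector<Op>& ops) {
    // 1. Record where this batch will end. If we crash after this point but
    //    before step 3 completes, lastOpTimeWritten < minValid on restart,
    //    so the node stays in RECOVERING instead of serving half-applied data.
    persistMinValid(ops.back());
    // 2. Apply the ops (possibly in parallel, out of order across namespaces).
    applyBatch(ops);
    // 3. Only now acknowledge the batch in the local oplog.
    writeBatchToLocalOplog(ops);
}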
_syncThread->SyncTail::oplogApplication->multiSyncApply
void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
    initializeWriterThread();
    for (std::vector<BSONObj>::const_iterator it = ops.begin();
         it != ops.end();
         ++it) {
        try {
            fassert(16359, st->syncApply(*it, true)); // the actual apply; syncApply was covered above
        } catch (DBException& e) {
            error() << "writer worker caught exception: " << e.what()
                    << " on: " << it->toString() << endl;
            fassertFailed(16360);
        }
    }
}
That completes the sync threads: at their core they read oplog entries and replay them on multiple threads.
Finally, let's look at Manager::msgCheckNewState, the function that decides who becomes primary.
void Manager::msgCheckNewState() {
    {
        RSBase::lock lk(rs);
        if( busyWithElectSelf ) return;
        checkElectableSet(); // add/remove ourselves from the primary candidates; may ask a lower-priority primary to step down
        checkAuth();         // check for unreachable members and authentication failures
        const Member *p = rs->box.getPrimary();
        if( p && p != rs->_self ) { // the known primary is unreachable: clear it
            if( !p->hbinfo().up() ||
                !p->hbinfo().hbstate.primary() ) {
                p = 0;
                rs->box.setOtherPrimary(0);
            }
        }
        const Member *p2; // is some other member claiming to be primary?
        bool two;
        p2 = findOtherPrimary(two);
        if( two ) {
            /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
            return;
        }
        if( p2 ) { // record it as the primary
            noteARemoteIsPrimary(p2);
            return;
        }
        if( p ) {
            /* we are already primary */
            if( p != rs->_self ) {
                return;
            }
            if( rs->elect.shouldRelinquish() ) { // we can no longer see a majority of the votes: give up the primary role
                rs->relinquish();
            }
            return;
        }
        if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary
            return;
        }
        /* no one seems to be primary. shall we try to elect ourself? */
        if( !rs->elect.aMajoritySeemsToBeUp() ) { // fewer than a majority of votes visible: bail out
            static time_t last;
            static int n;
            int ll = 0;
            if( ++n > 5 ) ll++;
            if( last + 60 > time(0) ) ll++;
            log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog;
            last = time(0);
            return;
        }
        if( !rs->iAmElectable() ) {
            return;
        }
        busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
    }
    try {
        rs->elect.electSelf(); // try to get ourselves elected primary
    }
    catch(RetryAfterSleepException&) { // requeue the task, i.e. run this function again later
        /* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */
        requeue();
    }
    busyWithElectSelf = false;
}
Manager::msgCheckNewState->checkElectableSet
void Manager::checkElectableSet() {
    unsigned otherOp = rs->lastOtherOpTime().getSecs(); // the freshest optime among the other members
    // make sure the electable set is up-to-date
    if (rs->elect.aMajoritySeemsToBeUp() && // we can see a majority,
        rs->iAmPotentiallyHot() &&
        (otherOp == 0 || rs->lastOpTimeWritten.getSecs() >= otherOp - 10)) { // and we are within 10s of the freshest member
        theReplSet->addToElectable(rs->selfId()); // add ourselves to the primary candidates
    }
    else {
        theReplSet->rmFromElectable(rs->selfId());
    }
    // check if we should ask the primary (possibly ourselves) to step down
    const Member *highestPriority = theReplSet->getMostElectable(); // the electable member with the highest priority
    const Member *primary = rs->box.getPrimary();
    if (primary && highestPriority &&
        highestPriority->config().priority > primary->config().priority &&
        // if we're stepping down to allow another member to become primary, we
        // better have another member (otherOp), and it should be up-to-date
        otherOp != 0 && highestPriority->hbinfo().opTime.getSecs() >= otherOp - 10) {
        if (primary->h().isSelf()) { // we are primary but not the highest priority: step down
            // replSetStepDown tries to acquire the same lock
            // msgCheckNewState takes, so we can't call replSetStepDown on
            // ourselves.
            rs->relinquish();
        }
        else {
            BSONObj cmd = BSON( "replSetStepDown" << 1 );
            ScopedConn conn(primary->fullName());
            BSONObj result;
            try { // ask the remote primary to step down
                if (!conn.runCommand(
                        "admin",
                        cmd,
                        result,
                        0,
                        &AuthenticationTable::getInternalSecurityAuthenticationTable())) {
                }
            }
            catch (DBException&) {
                // best-effort: the step-down command failing or throwing is tolerated
            }
        }
    }
}
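The step-down condition distills to a small predicate (function and parameter names are mine; the 10-second freshness window matches the code above): a higher-priority member may displace the primary only if its oplog is within 10 seconds of the freshest optime seen among the other members.

// Distilled version of the step-down test in checkElectableSet (illustrative).
bool shouldAskPrimaryToStepDown(int primaryPriority, int candidatePriority,
                                unsigned candidateOpSecs, unsigned freshestOtherOpSecs) {
    return candidatePriority > primaryPriority
        && freshestOtherOpSecs != 0                      // some other member is reporting an optime
        && candidateOpSecs >= freshestOtherOpSecs - 10;  // candidate is nearly caught up
}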
Back in msgCheckNewState, the server tries to elect itself primary by calling Consensus::electSelf, which in turn calls Consensus::_electSelf:
void Consensus::_electSelf() {
    bool allUp;
    int nTies;
    if( !weAreFreshest(allUp, nTies) ) { // are we the freshest, judged by lastOpTimeWritten?
        return;
    }
    if( !allUp && time(0) - started < 60 * 5 ) { // some member that was recently heartbeating did not report its optime -- it may have crashed
        /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data
           if we don't have to -- we'd rather be offline and wait a little longer instead
           todo: make this configurable.
        */
        return;
    }
    Member& me = *rs._self;
    if( nTies ) { // several members share the same last write time: sleep a random interval to avoid vote collisions
        /* tie? we then randomly sleep to try to not collide on our voting. */
        /* todo: smarter. */
        if( me.id() == 0 || sleptLast ) {
            // would be fine for one node not to sleep
            // todo: biggest / highest priority nodes should be the ones that get to not sleep
        }
        else {
            unsigned ms = ((unsigned) rand()) % 1000 + 50;
            sleptLast = true;
            sleepmillis(ms);
            throw RetryAfterSleepException();
        }
    }
    sleptLast = false;
    time_t start = time(0);
    unsigned meid = me.id();
    int tally = yea( meid );
    bool success = false;
    try { // nominate ourselves via the replSetElect command
        BSONObj electCmd = BSON(
            "replSetElect" << 1 <<
            "set" << rs.name() <<
            "who" << me.fullName() <<
            "whoid" << me.hbinfo().id() <<
            "cfgver" << rs._cfg->version <<
            "round" << OID::gen() /* this is just for diagnostics */
        );
        int configVersion;
        list<Target> L;
        rs.getTargets(L, configVersion);
        multiCommand(electCmd, L); // send the command to all members, one thread each
        {
            for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
                DEV log() << "replSet elect res: " << i->result.toString() << rsLog;
                if( i->ok ) {
                    int v = i->result["vote"].Int();
                    tally += v;
                }
            }
            if( tally*2 <= totalVotes() ) { // no strict majority of the votes
                log() << "replSet couldn't elect self, only received " << tally << " votes" << rsLog;
            }
            else if( time(0) - start > 30 ) { // the election round took too long
                // defensive; should never happen as we have timeouts on connection and operation for our conn
                log() << "replSet too much time passed during our election, ignoring result" << rsLog;
            }
            else if( configVersion != rs.config().version ) { // the config changed underneath us
                log() << "replSet config version changed during our election, ignoring result" << rsLog;
            }
            else {
                /* succeeded. */
                log(1) << "replSet election succeeded, assuming primary role" << rsLog;
                success = true;
                rs.assumePrimary(); // become primary
            }
        }
    }
    catch( std::exception& ) {
        if( !success ) electionFailed(meid);
        throw;
    }
    if( !success ) electionFailed(meid);
}
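The decisive test is tally*2 > totalVotes(): a strict majority of all configured votes, counting members that did not respond. For example:

#include <iostream>

// Strict-majority test used above: an election succeeds only if the yea
// tally exceeds half of ALL configured votes, unreachable members included.
bool electionWins(int tally, int totalVotes) { return tally * 2 > totalVotes; }

int main() {
    // 5 members, 1 vote each; the candidate got its own vote plus two others.
    std::cout << electionWins(3, 5) << "\n";  // 1: 3*2 > 5, elected
    // With only 2 of 5 members reachable, 2 votes can never win.
    std::cout << electionWins(2, 5) << "\n";  // 0: no majority, no primary
}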
Manager::msgCheckNewState->Consensus::electSelf->Consensus::_electSelf->CmdReplSetElect::run->Consensus::electCmdReceived
void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
    BSONObjBuilder& b = *_b;
    string set = cmd["set"].String();
    unsigned whoid = cmd["whoid"].Int();
    int cfgver = cmd["cfgver"].Int();
    OID round = cmd["round"].OID();
    int myver = rs.config().version;
    const Member* primary = rs.box.getPrimary();
    const Member* hopeful = rs.findById(whoid);
    const Member* highestPriority = rs.getMostElectable();
    int vote = 0;
    if( myver > cfgver ) { // our config is newer than the candidate's:
        vote = -10000;     // vote -10000 so it cannot possibly win
    }
    else if( !hopeful ) { // we don't know this member at all
        vote = -10000;
    }
    else if( primary && primary == rs._self && rs.lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
        vote = -10000; // we are primary and at least as up to date: veto
    }
    else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
        vote = -10000; // there is a primary that is at least as up to date: veto
    }
    else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
        vote = -10000; // a higher-priority member exists: veto
    }
    else {
        vote = yea(whoid); // cast our configured votes for the candidate
        rs.relinquish();
    }
    b.append("vote", vote);
    b.append("round", round);
}
As this shows, becoming primary requires the other members' votes, and the tally must exceed half of the total votes. That wraps up replica set synchronization and state switching. The flow is fairly involved; the parts to keep in mind are the heartbeats, the producer thread, and the msgCheckNewState transitions. Also note that a member with priority 0 can never become primary, and the set operates normally only while more than half of the votes are reachable.
Original article: mongodb源码分析(十六) replication replset同步以及状态的切换
Author: yhjj0108, 杨浩