Ceph Monitor选主(下)

monitor选主的下半段就是真正的选主

发送OP_PROPOSE请求

void Monitor::start_election()
	_reset();
	elector.call_election();	
		start();
			acked_me.clear();
			init();
				epoch = mon->store->get(Monitor::MONITOR_NAME, "election_epoch");
				if (!epoch) 
					epoch = 1;
				// 如果epoch为奇数,则说明正处在选举阶段,将epoch+1,以跨过这个阶段
				else if (epoch % 2)
					++epoch;
					t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
					mon->store->apply_transaction(t);
			// 如果为偶数,则加1,表示处于选举阶段
			if (epoch % 2 == 0)
				bump_epoch(epoch+1);
			electing_me = true;
			leader_acked = -1;
			for (unsigned i=0; i<mon->monmap->size(); ++i)
				if ((int)i == mon->rank) continue;
				MMonElection *m = new MMonElection(MMonElection::OP_PROPOSE, epoch, mon->monmap);
				mon->messenger->send_message(m, mon->monmap->get_inst(i));
			reset_timer();

主动启动选举的节点向monmap中的其它节点发送OP_PROPOSE请求,并将选举的epoch加1,置为奇数。

处理OP_PROPOSE请求

void Elector::handle_propose(MonOpRequestRef op)
	MMonElection *m = static_cast<MMonElection*>(op->get_req());
	int from = m->get_source().num();
	// 对方的选举版本大于自己
	if (m->epoch > epoch)
		bump_epoch(m->epoch);
			epoch = e;
			t->put(Monitor::MONITOR_NAME, "election_epoch", epoch);
			mon->store->apply_transaction(t);
			mon->join_election();
			electing_me = false;
			acked_me.clear();
	// 对方的选举版本小于自己
	else if (m->epoch < epoch)
		if (epoch % 2 == 0 && mon->quorum.count(from) == 0)
			mon->start_election();
		else
			dout(5) << " ignoring old propose" << dendl;
			return;
	// 即使选举,也是本节点赢
	if (mon->rank < from)
		if (leader_acked >= 0)
			assert(leader_acked < from); 
		else
			if (!electing_me)
				mon->start_election();
	else
		if (leader_acked < 0 || leader_acked > from || leader_acked == from)
			// 对方会赢得选举
			defer(from);
				if (electing_me)
					acked_me.clear();
					electing_me = false;
				// ack them
				leader_acked = who;
				MMonElection *m = new MMonElection(MMonElection::OP_ACK, epoch, mon->monmap);
				m->mon_features = ceph::features::mon::get_supported();
				mon->collect_metadata(&m->metadata);
				mon->messenger->send_message(m, mon->monmap->get_inst(who));
				// set a timer
				reset_timer(1.0); 
		else
			dout(5) << "no, we already acked " << leader_acked << dendl;

其它节点收到OP_PROPOSE请求后,
(1) 如果对方的选举版本大于自己,则将自己的选举版本设置为对方的选举版本。
(2) 如果对方的选举版本小于自己,并且自己不处于选举阶段(epoch为偶数)、对方也不在本节点的quorum缓存里,则说明对方可能是新加入的节点,这种情况下自己主动开启选举,以便让其加入到quorum;否则忽略该请求,直接返回。

如果没有忽略该请求,继续采取如下行为:
(1) 如果自己的rank小于对方,则一定不会选举对方为主节点,如果这时本节点没有回应过其它节点,则自己会发起选举。
(2) 如果对方的rank小于自己,并且自己尚未回应过任何节点,或者对方的rank小于等于自己已经回应过的节点的rank,则向对方回应OP_ACK(defer),支持对方成为主节点;否则说明自己已经回应过rank更小的节点,不再回应对方。

处理ACK

void Elector::handle_ack(MonOpRequestRef op)
	// 本节点发起的选举请求,要求选本节点
	if (electing_me) 
		acked_me[from].cluster_features = m->get_connection()->get_features();
		acked_me[from].mon_features = m->mon_features;
		acked_me[from].metadata = m->metadata;
		// 要求monmap中的全部节点都同意我作为leader才可以
		if (acked_me.size() == mon->monmap->size()) 
			victory();	
		assert(leader_acked >= 0);

收到OP_ACK后,将回应的节点插入到acked_me中,如果acked_me的大小和monmap的大小相同,则说明全部节点都同意我作为主节点。

victory()
	bump_epoch(epoch+1);     // is over!  偶数结束
	for (map<int, elector_info_t>::iterator p = acked_me.begin(); p != acked_me.end(); ++p) {
		quorum.insert(p->first);
	for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) 
		if (*p == mon->rank) continue;
		MMonElection *m = new MMonElection(MMonElection::OP_VICTORY, epoch, mon->monmap);
		m->quorum = quorum;
		mon->messenger->send_message(m, mon->monmap->get_inst(*p));
	mon->win_election(epoch, quorum, cluster_features, mon_features, metadata);
		state = STATE_LEADER;
		leader_since = ceph_clock_now();
		leader = rank;
		quorum = active;
		quorum_con_features = features;
		quorum_mon_features = mon_features;
		pending_metadata = metadata;
		outside_quorum.clear();		
		paxos->leader_init();	

(1) 将选举版本加1,变为偶数。
(2) 将acked_me的节点赋值给quorum。
(3) 对quorum的每个节点,发送OP_VICTORY消息。
(4) 初始化paxos,主要是提交上次没有提交的消息。

leader_init	
	pending_proposal.reset();
	state = STATE_RECOVERING;
	lease_expire = utime_t();
	collect(0); // paxos里的函数
		// 有未确认的消息
		if (get_store()->exists(get_name(), last_committed+1))
			version_t v = get_store()->get(get_name(), "pending_v");
			version_t pn = get_store()->get(get_name(), "pending_pn");
			if (v && pn && v == last_committed + 1)
				uncommitted_pn = pn;
			else
				uncommitted_pn = accepted_pn;
			uncommitted_v = last_committed+1;
			get_store()->get(get_name(), last_committed+1, uncommitted_value);
		// 生成新的accepted_pn,只有在每次选举成功才能生成新的
		accepted_pn = get_new_proposal_number(std::max(accepted_pn, oldpn));
		accepted_pn_from = last_committed;
		for (set<int>::const_iterator p = mon->get_quorum().begin(); p != mon->get_quorum().end(); ++p)
			if (*p == mon->rank) continue;
			MMonPaxos *collect = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COLLECT,  ceph_clock_now());
			collect->last_committed = last_committed;
			collect->first_committed = first_committed;
			collect->pn = accepted_pn;
			mon->messenger->send_message(collect, mon->monmap->get_inst(*p));	

(1) 如果存在last_committed+1版本的日志,则说明存在没有完成同步的消息,此时获取pending_v(未提交日志的版本号)、pending_pn(未提交日志的accepted_pn)以及last_committed+1版本的日志。
(2) 生成新的accepted_pn,该值只在每次选举完成后重新生成。
(3) 向quorum其它节点发送OP_COLLECT消息。

副节点处理OP_COLLECT消息

void Paxos::handle_collect(MonOpRequestRef op)
	state = STATE_RECOVERING;
	MMonPaxos *last = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_LAST,  ceph_clock_now());
	last->last_committed = last_committed;
	last->first_committed = first_committed;
	version_t previous_pn = accepted_pn;
	// 接受对方的accepted_pn
	if (collect->pn > accepted_pn)
		accepted_pn = collect->pn;
		accepted_pn_from = collect->pn_from;
		t->put(get_name(), "accepted_pn", accepted_pn);
		get_store()->apply_transaction(t);
	last->pn = accepted_pn;
	last->pn_from = accepted_pn_from;
	
	if (collect->last_committed < last_committed)
		share_state(last, collect->first_committed, collect->last_committed);
			version_t v = peer_last_committed + 1;
			for ( ; v <= last_committed; v++) 
				get_store()->get(get_name(), v, m->values[v]);
			m->last_committed = last_committed;
	
	if (collect->last_committed <= last_committed && get_store()->exists(get_name(), last_committed+1))
		get_store()->get(get_name(), last_committed+1, bl);
		last->values[last_committed+1] = bl;
		version_t v = get_store()->get(get_name(), "pending_v");
		version_t pn = get_store()->get(get_name(), "pending_pn");
		if (v && pn && v == last_committed + 1)
			last->uncommitted_pn = pn;
		else
			last->uncommitted_pn = previous_pn;
	collect->get_connection()->send_message(last);

副节点收到OP_COLLECT消息后,如果主节点的last_committed小于自己的,说明主节点缺失部分日志,就将缺失的这段日志分享给主节点;如果自己有未确认的日志,则一并发送给对方。

主节点处理OP_LAST消息

void Paxos::handle_last(MonOpRequestRef op)
	peer_first_committed[from] = last->first_committed;
	peer_last_committed[from] = last->last_committed;
	// 如果对方的日志更新于本节点,则在本节点持久化缺失的日志
	need_refresh = store_state(last);
	// 之所以每次副节点的LAST消息到来都要遍历peer_last_committed,是因为每次有新的LAST消息到来都可能会改变
	// 本节点的日志,所以每次都需要和LAST消息已被接收的节点比较,如果对方的日志太旧,则更新
	for (map<int,version_t>::iterator p = peer_last_committed.begin(); p != peer_last_committed.end(); ++p) 
		// 对方收到OP_PROBE时,检测到自己的版本太落后,会bootstrap
		if (p->second + 1 < first_committed && first_committed > 1)
			mon->bootstrap();
			return;
		// 对方的日志不是太落后,则直接在此更新
		if (p->second < last_committed)
			MMonPaxos *commit = new MMonPaxos(mon->get_epoch(), MMonPaxos::OP_COMMIT, ceph_clock_now());
			share_state(commit, peer_first_committed[p->first], p->second);
			mon->messenger->send_message(commit, mon->monmap->get_inst(p->first));
	if (last->pn > accepted_pn)
		collect(last->pn);
	else if (last->pn == accepted_pn)
		num_last++;
		if (last->uncommitted_pn)
			if (last->uncommitted_pn >= uncommitted_pn && last->last_committed >= last_committed && last->last_committed + 1 >= uncommitted_v)
				uncommitted_v = last->last_committed+1;
				uncommitted_pn = last->uncommitted_pn;
				uncommitted_value = last->values[uncommitted_v];
		if (num_last == mon->get_quorum().size())
			if (uncommitted_v == last_committed+1 && uncommitted_value.length())
				state = STATE_UPDATING_PREVIOUS;
				begin(uncommitted_value);
			else
				extend_lease();

主节点收到OP_LAST消息后,如果副节点的日志比自己新,则更新自己的日志。然后遍历peer_last_committed,对比本节点更新后的日志和其它节点的日志:如果其它节点的日志过于落后(对方的last_committed+1小于本节点的first_committed),本节点重新调用bootstrap,bootstrap会发送OP_PROBE消息,对方收到OP_PROBE消息时,检测到自己的日志太过落后,就会主动调用bootstrap;如果peer_last_committed中其它节点的日志只是稍微落后于本节点,就主动将缺失的日志发送给对方,不需要重新调用bootstrap。

如果副节点的accepted_pn大于主节点的accepted_pn,则主节点重新collect,并生成新的accepted_pn。
如果副节点的accepted_pn等于本节点的accepted_pn,则说明副节点接受了本节点的accepted_pn。此时判断副节点是否带有未提交的日志,并且该日志不旧于主节点已记录的未提交日志,如果是,则记录下来;当quorum中所有节点的LAST消息都到齐后,如果存在未提交的日志,就通过正常的paxos过程将其重新提交并广播出去,如果没有未提交的日志,就调用extend_lease延长副节点的租约。

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值