集群初始化过程中在GridCachePartitionExchangeManager中启动exchange-worker线程
new IgniteThread(cctx.igniteInstanceName(), "exchange-worker", exchWorker).start();
新加入节点激活集群时(例如ignite.cluster().active(true))会调用GridClusterStateProcessor.changeGlobalState方法
GridDiscoveryManager.sendCustomEvent(msg),msg是封装的ChangeGlobalStateMessage
在MessageWorker将msg加入自己阻塞队列queue中
MessageWorker通过ServerImpl.sendMessageAcrossRing(msg)将信息发送到集群构成的环中
DiscoveryWorker("disco-event-worker")监听到发现事件将GridDhtPartitionsExchangeFuture加入自己的阻塞队列futQ
ExchangeWorker从futQ中取出元素作为CachePartitionExchangeWorkerTask task
ExchangeWorker{
exchFut.init(newCrd);//exchFut即为上面的task,如果当前节点在exchange后变为coordinator时newCrd设置为true
if(firstDiscoEvt.type()== EVT_DISCOVERY_CUSTOM_EVT){
GridDhtPartitionsExchangeFuture.onClusterStateChangeRequest(crdNode);//crdNode为false表示当前节点不是协调器节点----------p13
//在当前节点上启动缓存 Cache actions
List<ExchangeActions.CacheActionData> startReqList = exchActions.cacheStartRequests();//获得缓存启动数据
List<DynamicCacheDescriptor> startDescs = startReqList.DynamicCacheDescriptor()//伪代码,获得缓存启动描述器
GridCacheDatabaseSharedManager.readCheckpointAndRestoreMemory(startDescs);//获得检查点读锁,并恢复缓存
registerCachesFuture = CacheAffinitySharedManager.onCacheChangeRequest(this, crd, exchActions);//this为GridDhtPartitionsExchangeFuture对象,crd是否为协调器,exchActions为缓存行为对象
CachesRegistry.registerAllCachesAndGroups(grpDescs, cacheDescs);//注册缓存组描述器和动态缓存描述器
CacheAffinitySharedManager.processCacheStartRequests(fut,crd,exchActions);//处理缓存启动请求 p20
GridCacheProcessor.startCacheGroup(req.startCacheConfiguration(),cacheDesc,nearCfg,evts.topologyVersion(),req.disabledAfterStart());//启动缓存组
GridCacheContext cacheCtx = createCache(ccfg,grp,null,desc,exchTopVer,cacheObjCtx,affNode,true,disabledAfterStart);//创建缓存
GridCacheProcessor.startCache(cache,desc.schema()!= null?desc.schema():new QuerySchema());//启动创建的缓存
//Affinity actions
CacheAffinitySharedManager.initAffinity(cachesRegistry.group(grp.groupId()),grp.affinity(),fut);//初始化affinity
List<List<ClusterNode>> assignment = GridAffinityAssignmentCache.calculate(topVer, evts, evts.discoveryCache());//获得缓存分区到节点的分配
List<List<ClusterNode>> RendezvousAffinityFunction.assignPartitions(AffinityFunctionContext affCtx);//默认用RendezvousAffinityFunction进行分配
List<ClusterNode> RendezvousAffinityFunction.assignPartition(int part,List<ClusterNode> nodes,int backups,@Nullable Map<UUID, Collection<ClusterNode>> neighborhoodCache);//对每个分区进行分配
GridAffinityAssignmentCache.initialize(topVer, assignment);//初始化affinity根据给定拓扑版本和分配
//接p20 处理缓存结束请求
CacheAffinitySharedManager.processCacheStopRequests(GridDhtPartitionsExchangeFuture fut,boolean crd,final ExchangeActions exchActions,boolean forceClose);
}else if(firstDiscoEvt.type()== EVT_NODE_JOINED){
registerCachesFuture = GridDhtPartitionsExchangeFuture.initCachesOnLocalJoin();
CacheAffinitySharedManager.initCachesOnLocalJoin(locJoinCtx.cacheGroupDescriptors(),locJoinCtx.cacheDescriptors());//使用缓存组描述器s和缓存描述器s初始化缓存
CachesRegistry.registerAllCachesAndGroups(grpDescs, cacheDescs);//注册缓存组描述器和动态缓存描述器
GridCacheProcessor.prepareCacheStart(desc.cacheConfiguration(),desc,t.get2(),exchTopVer,false);//开始启动缓存
GridCacheProcessor.startCacheGroup(desc.groupDescriptor(),desc.cacheType(),affNode,cacheObjCtx,exchTopVer);//启动缓存组
GridCacheContext cacheCtx = createCache(ccfg,grp,null,desc,exchTopVer,cacheObjCtx,affNode,true,disabledAfterStart);//创建缓存
GridCacheProcessor.startCache(cache,desc.schema()!= null?desc.schema():new QuerySchema());//启动创建的缓存
}
//接p13
GridDhtPartitionsExchangeFuture.distributedExchange();
CacheGroupContext.preloader().onTopologyChanged(this);
IgniteCacheDatabaseSharedManager.reserveHistoryForExchange();//要在启用持久性时正确地重新平衡,有必要在交换中保留历史记录
GridDhtPartitionsExchangeFuture.waitPartitionRelease(distributed,true);//我们等待所有节点完成本地事务更新,原子更新和锁释放
GridDhtPartitionsExchangeFuture.waitPartitionRelease(false, false);//等待完成剩余的从主节点到备份节点的所有事务更新
IgniteCacheDatabaseSharedManager.beforeExchange(this);//必须在所有拓扑回调之前运行数据库回调。如果启用了持久存储,则首先还原磁盘上显示的分区
GridDhtPartitionTopologyImpl.beforeExchange( GridDhtPartitionsExchangeFuture exchFut,boolean affReady,boolean updateMoving);
GridDhtPartitionTopologyImpl.initPartitions(affVer, affAssignment, exchFut, updateSeq);//用给定的拓扑版本和aff分配创建和初始化分区
IgniteCacheDatabaseSharedManager.onStateRestored();//当所有分区都已完全还原并在节点启动预先创建时调用
//发送Single message
GridDhtPartitionsExchangeFuture.sendPartitions(crd);//crd为协调器节点,向协调器发送本地分区信息
GridCacheIoManager.send(node, msg, SYSTEM_POOL);//msg为创建的GridDhtPartitionsSingleMessage,包含了单个节点分区信息
GridNioServer.AbstractNioClientWorker.offer((SessionChangeRequest)req);//底层使用NIO进行封装发送到协调器节点
}
sys-#43{
//当前节点收到协调器节点发送的GridDhtPartitionsFullMessage
GridIoManager.processRegularMessage0(msg,nodeId);//msg是收到的GridDhtPartitionsFullMessage,nodeId为协调器节点UUID
GridCacheIoManager.processMessage(UUID nodeId,GridCacheMessage msg,IgniteBiInClosure <UUID,GridCacheMessage> c);
GridCachePartitionExchangeManager.processFullPartitionUpdate(node,msg);//处理全局的分区更新
GridDhtPartitionsExchangeFuture.onReceiveFullMessage(final ClusterNode node, final GridDhtPartitionsFullMessage msg);//监听到全局分区改变消息
GridDhtPartitionsExchangeFuture.processFullMessage(true, node, msg);
if(如果是本地节点加入导致分区变化执行该操作){
CacheAffinitySharedManager.onLocalJoin(this, msg, resTopVer);
CacheAffinitySharedManager.forAllCacheGroups(boolean crd, IgniteInClosureX<GridAffinityAssignmentCache> c);//对于所有的缓存组执行指定的闭包
CacheGroupAffinityMessage affMsg = receivedAff.get(aff.groupId());//获得缓存组的分区分配
List<List<ClusterNode>> assignments = affMsg.createAssignments(nodesByOrder, evts.discoveryCache());//根据初始的idealAssigns和收到的discoCache计算理想的分区分配
aff.calculate(evts.topologyVersion(), evts, evts.discoveryCache());//assignments为null,需要重新计算理想的分区分配根据交换事件中的信息
List<List<ClusterNode>> RendezvousAffinityFunction.assignPartitions(AffinityFunctionContext affCtx);//默认用RendezvousAffinityFunction进行分配
List<ClusterNode> RendezvousAffinityFunction.assignPartition(int part,List<ClusterNode> nodes,int backups,@Nullable Map<UUID, Collection<ClusterNode>> neighborhoodCache);//对每个分区进行分配
CacheAffinitySharedManager.initialize(evts.topologyVersion(), assignments);//初始化affinity根据给定拓扑版本和分配
GridDhtPartitionTopologyImpl.initPartitions(affVer, affAssignment, exchFut, updateSeq);//用给定的拓扑版本和aff分配创建和初始化分区
}else if(根据实际的分区分布强制执行亲和力重新分配){
CacheAffinitySharedManager.applyAffinityFromFullMessage(this, msg);//从收到的全局消息中应用亲和力差异
forAllCacheGroups(false, new IgniteInClosureX<GridAffinityAssignmentCache>());
Map<Integer, CacheGroupAffinityMessage> idealAffDiff = msg.idealAffinityDiff();//获得与理想分区的差异
List<List<ClusterNode>> idealAssignment = aff.calculate(evts.topologyVersion(), evts, evts.discoveryCache());//根据全局消息中的事件信息计算得到理想分配
CacheGroupAffinityMessage affMsg = idealAffDiff != null ? idealAffDiff.get(aff.groupId()) : null;//得到只有分区差异信息的分区消息
newAssignment.set(e.getKey(), CacheGroupAffinityMessage.toNodes(assign,nodesByOrder,evts.discoveryCache()));//根据差异信息调整分区分配
CacheAffinitySharedManager.initialize(evts.topologyVersion(), assignments);//初始化affinity根据给定拓扑版本和分配
}
GridDhtPartitionsExchangeFuture.updatePartitionFullMap(resTopVer, msg);//更新所有缓存的分区映射
partHistSuppliers.putAll(msg.partitionHistorySuppliers());//根据GridDhtPartitionsFullMessage给partHistSuppliers赋值
CachePartitionFullCountersMap cntrMap = msg.partitionUpdateCounters(grpId,grp.topology().partitions());//获得更新的分区数量
grp.topology().update(resTopVer,entry.getValue(),cntrMap,msg.partsToReload(cctx.localNodeId(), grpId),msg.partitionSizes(grpId),null);//更新拓扑
cntrMap.updateCounter(i, incomeCntrMap.updateCounter(i));//更新分区计数器
AffinityAssignment aff = grp.affinity().readyAffinity(readyTopVer);//获得目前的缓存全局分区信息
updateRebalanceVersion(aff.topologyVersion(), aff.assignment());//更新rebalance版本
GridDhtPartitionsExchangeFuture.onDone(resTopVer,err);//完成全局变化信息处理
detectLostPartitions(res);//检查丢失的分区,res为目前的AffinityTopologyVersion
processCacheStopRequestOnExchangeDone(exchActions);//如果没有丢失分区,处理缓存结束请求
ExchangeActions.cacheStopRequests();//停止缓存的请求
}
协调器节点视角 (当有节点加入时)
sys-#**{
...
GridCacheIoManager.processMessage(nodeId, cacheMsg, c);//处理收到的GridDhtPartitionsSingleMessage消息
GridCachePartitionExchangeManager.processSinglePartitionUpdate(node, msg);
exchFut.onReceiveSingleMessage(node, msg);//处理单个节点的分区更新
processSingleMessage(node.id(), msg);
GridDhtPartitionsExchangeFuture.updatePartitionSingleMap(UUID nodeId,GridDhtPartitionsSingleMessage msg);//更新分区拓扑
onAllReceived(null);
GridCachePartitionExchangeManager.mergeExchangesOnCoordinator(this);//合并分区更改事件
finishExchangeOnCoordinator(sndResNodes);//p115
WaitRebalanceInfo waitRebalanceInfo = initAffinityOnNodeJoin(fut, crd);//初始化affinity
initAffinityOnNodeJoin(evts,grpAdded,cache.affinity(),waitRebalanceInfo,latePrimary,affCache);
List<List<ClusterNode>> idealAssignment = aff.calculate(evts.topologyVersion(), evts, evts.discoveryCache());//根据节点消息中的事件信息计算得到理想分配
aff.initialize(evts.topologyVersion(), cachedAssignment(aff, newAssignment, affCache));//使用给定的拓扑版本和分配初始化亲和力
top.beforeExchange(this, true, true);//预初始化拓扑
GridDhtPartitionTopologyImpl.initPartitions(affVer, affAssignment, exchFut, updateSeq);//用给定的拓扑版本和aff分配创建和初始化分区
createPartitions(affVer, affAssignment, updateSeq);//根据affinity创建不存在的分区
updateRebalanceVersion(affVer,affAssignment);//更新rebalance的版本rebalancedTopVer
createMovingPartitions(grp.affinity().readyAffinity(evts.topologyVersion()));//创建移动的分区
CachePartitionPartialCountersMap cntrs = msg.partitionUpdateCounters(grpId,top.partitions());//根据收到的single msg 更新分区计数器
GridDhtPartitionsExchangeFuture.assignPartitionsStates();//分配分区的状态
GridDhtPartitionsFullMessage msg = createPartitionsMessage(true,minVer.compareToIgnoreTimestamp(PARTIAL_COUNTERS_MAP_SINCE) >= 0);//创建full msg
m.addPartitionUpdateCounters(grp.groupId(), cntrsMap);//给full msg增加分区计数器 partCntrs2
addFullPartitionsMap(m,dupData,compress,grp.groupId(),locMap,affCache.similarAffinityKey());//为msg添加全局分区映射 parts和dupPartsData
m.addPartitionSizes(grp.groupId(), grp.topology().globalPartSizes());//msg添加分区大小信息partsSizes
msg.resultTopologyVersion(resTopVer);//更新msg的拓扑版本
msg.prepareMarshal(cctx);//对msg进行序列化
sendAllPartitions(msg, nodes, mergedJoinExchMsgs0, joinedNodeAff);//mergedJoinExchMsgs0为多个节点加入产生single msg的merge,将full msg发送给非协调器节点
cctx.io().send(node, fullMsgToSend, SYSTEM_POOL);
onDone(exchCtx.events().topologyVersion(), null);
}
ExchangeWorker{
exchFut.init(newCrd);
if (firstDiscoEvt.type() == EVT_NODE_JOINED) {
if(!firstDiscoEvt.eventNode().isLocal()){
GridCacheProcessor.startReceivedCaches(firstDiscoEvt.eventNode().id(),topVer);//启动交换期间从远程节点接收的静态配置的缓存
CacheAffinitySharedManager.initStartedCaches(crdNode, this, receivedCaches);//初始化亲和力
updateTopologies(crdNode, cctx.coordinators().currentCoordinator());//更新所有拓扑上的拓扑版本和发现缓存
}else{
registerCachesFuture = GridDhtPartitionsExchangeFuture.initCachesOnLocalJoin();
CacheAffinitySharedManager.initCachesOnLocalJoin(locJoinCtx.cacheGroupDescriptors(),locJoinCtx.cacheDescriptors());//使用缓存组描述器s和缓存描述器s初始化缓存
CachesRegistry.registerAllCachesAndGroups(grpDescs, cacheDescs);//注册缓存组描述器和动态缓存描述器
GridCacheProcessor.prepareCacheStart(desc.cacheConfiguration(),desc,t.get2(),exchTopVer,false);//开始启动缓存
}
}else if(firstDiscoEvt.type() == EVT_DISCOVERY_CUSTOM_EVT){
cctx.affinity().onCentralizedAffinityChange(this, crdNode);//由服务器节点离开或自定义事件(具有集中的相似性分配)发起的交换调用
forAllRegisteredCacheGroups(new IgniteInClosureX<CacheGroupDescriptor>() c);//对所有已经注册的缓存组执行闭包中操作
cache.aff.calculate(fut.initialVersion(), fut.events(), fut.events().discoveryCache());//计算affinity
//discoCache.state().baselineTopology().equals(baselineTopology)为false时重新分配分区,利用Wang/Jenkins hash对每个分区进行节点排序,前(1+backups)被记录下来
aff.assignPartitions(new GridAffinityFunctionContextImpl(discoCache.state().baselineTopology().createBaselineView(sorted, nodeFilter),
prevAssignment, events.lastEvent(), topVer, backups));
updateTopologies(crdNode, cctx.coordinators().currentCoordinator());//更新所有拓扑上的拓扑版本和发现缓存
}
GridDhtPartitionsExchangeFuture.distributedExchange();
grp.preloader().onTopologyChanged(this);
IgniteCacheDatabaseSharedManager.reserveHistoryForExchange();//要在启用持久性时正确地重新平衡,有必要在交换中保留历史记录
GridDhtPartitionsExchangeFuture.waitPartitionRelease(distributed,true);//我们等待所有节点完成本地事务更新,原子更新和锁释放
GridDhtPartitionsExchangeFuture.waitPartitionRelease(false, false);//等待完成剩余的从主节点到备份节点的所有事务更新
IgniteCacheDatabaseSharedManager.beforeExchange(this);//必须在所有拓扑回调之前运行数据库回调。如果启用了持久存储,则首先还原磁盘上显示的分区
GridDhtPartitionTopologyImpl.beforeExchange( GridDhtPartitionsExchangeFuture exchFut,boolean affReady,boolean updateMoving);
GridDhtPartitionTopologyImpl.initPartitions(affVer, affAssignment, exchFut, updateSeq);//用给定的拓扑版本和aff分配创建和初始化分区
GridDhtPartitionTopologyImpl.createPartitions(affVer, affAssignment, updateSeq);//根据affinity创建不存在的分区
updateSeq = GridDhtPartitionTopologyImpl.updateLocal(p, locPart.state(), updateSeq, affVer);//更新本地node2part映射中的分区状态,并重新计算diffFromAffinity
updateRebalanceVersion(affVer,affAssignment);//更新rebalance的版本rebalancedTopVer
if(!crd.isLocal()){//当前节点是协调器节点,不发送single message
// Single message
}
GridDhtPartitionsExchangeFuture.initDone();//回头通知future已经完成
GridDhtPartitionsExchangeFuture.processSingleMessage(node.id(),msg);//处理单个分区消息
GridDhtPartitionsExchangeFuture.onAllReceived(Collection <ClusterNode> sndResNodes);
GridDhtPartitionsExchangeFuture.finishExchangeOnCoordinator(Collection <ClusterNode> sndResNodes);//在协调器节点完成分区变更
cctx.affinity().onServerJoinWithExchangeMergeProtocol(this, true);
forAllRegisteredCacheGroups(new IgniteInClosureX<CacheGroupDescriptor>() c);//对每个注册的分区执行闭包
initAffinityOnNodeJoin(evts,grpAdded,cache.affinity(),waitRebalanceInfo,latePrimary,affCache);//初始化affinity
aff.initialize(evts.topologyVersion(), cachedAssignment(aff, newAssignment, affCache));//使用给定的拓扑版本和分配初始化亲和力
GridDhtPartitionsFullMessage msg = createPartitionsMessage(true,minVer.compareToIgnoreTimestamp(PARTIAL_COUNTERS_MAP_SINCE) >= 0);//生成full message
m.addPartitionUpdateCounters(grp.groupId(), cntrsMap);//对msg添加分区计数器partCntrs2
m.addPartitionSizes(grp.groupId(), grp.topology().globalPartSizes());//对msg添加分区大小消息partsSizes
msg.prepareMarshal(cctx);//对full message进行序列化
sendAllPartitions(msg, nodes, mergedJoinExchMsgs0, joinedNodeAff);//发送full message 底层使用NIO封装发送出去,需要借助于GridNioServer.processWrite(key)
ConcurrentLinkedQueue<SessionChangeRequest> changeReqs.offer(SessionChangeRequest req);//将full msg封装到SessionChangeRequest对象中
}
grid-nio-work-tcp-comm-0-*{
processSelectedKeysOptimized(selectedKeys.flip());//处理选择器选择的key
GridNioServer.processWrite(key);//在key上进行写就绪处理
finished = msg.writeTo(buf, writer);//根据key获得session,进而获得buffer和writer
writer.writeMessage("msg", msg);//写对应的嵌套消息
while ((req0 = changeReqs.poll()) != null){}//从changeReqs队列中取请求
}
tcp-disco-msg-worker-#*{
processMessage(msg);//msg为ChangeGlobalStateFinishMessage消息
}