gfsh>start server --name=server1 --locators=127.0.0.1[10334] --server-port=0
Starting a Geode Server in C:\tools\apache-geode-1.7.0\bin\server1...
The Cache Server process terminated unexpectedly with exit status 1. Please refer to the log file in C:\tools\apache-geode-1.7.0\bin\server1 for full details.
Exception in thread "main" org.apache.geode.SystemConnectException: Unable to join the distributed system in 60071ms
at org.apache.geode.distributed.internal.membership.gms.membership.GMSJoinLeave.join(GMSJoinLeave.java:391)
at org.apache.geode.distributed.internal.membership.gms.mgr.GMSMembershipManager.join(GMSMembershipManager.java:656)
at org.apache.geode.distributed.internal.membership.gms.mgr.GMSMembershipManager.joinDistributedSystem(GMSMembershipManager.java:745)
at org.apache.geode.distributed.internal.membership.gms.Services.start(Services.java:191)
at org.apache.geode.distributed.internal.membership.gms.GMSMemberFactory.newMembershipManager(GMSMemberFactory.java:106)
at org.apache.geode.distributed.internal.membership.MemberFactory.newMembershipManager(MemberFactory.java:90)
at org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:1042)
at org.apache.geode.distributed.internal.ClusterDistributionManager.<init>(ClusterDistributionManager.java:1076)
at org.apache.geode.distributed.internal.ClusterDistributionManager.create(ClusterDistributionManager.java:562)
at org.apache.geode.distributed.internal.InternalDistributedSystem.initialize(InternalDistributedSystem.java:763)
at org.apache.geode.distributed.internal.InternalDistributedSystem.newInstance(InternalDistributedSystem.java:355)
at org.apache.geode.distributed.internal.InternalDistributedSystem.newInstance(InternalDistributedSystem.java:343)
at org.apache.geode.distributed.internal.InternalDistributedSystem.newInstance(InternalDistributedSystem.java:335)
at org.apache.geode.distributed.DistributedSystem.connect(DistributedSystem.java:211)
at org.apache.geode.cache.CacheFactory.create(CacheFactory.java:219)
at org.apache.geode.distributed.internal.DefaultServerLauncherCacheProvider.createCache(DefaultServerLauncherCacheProvider.java:52)
at org.apache.geode.distributed.ServerLauncher.createCache(ServerLauncher.java:856)
at org.apache.geode.distributed.ServerLauncher.start(ServerLauncher.java:774)
at org.apache.geode.distributed.ServerLauncher.run(ServerLauncher.java:704)
at org.apache.geode.distributed.ServerLauncher.main(ServerLauncher.java:224)
启动 server 时出现了上面这个错误。从这个异常堆栈(自下而上阅读)可以看出启动一个 server 所要执行的步骤:
-
ServerLauncher.createCache
-
DefaultServerLauncherCacheProvider.createCache
-
CacheFactory.create
-
DistributedSystem.connect
// Make a new connection to the distributed system
InternalDistributedSystem newSystem = InternalDistributedSystem.newInstance(config);
addSystem(newSystem);
这段代码先创建一个新的系统(InternalDistributedSystem.newInstance),然后把它加进集群里面;问题就出在这个创建的过程中 -
this.dm = ClusterDistributionManager.create(this); 创建ClusterDistributionManager
-
distributionManager = new ClusterDistributionManager(system, transport); new实例化
这个构造函数创建了很多线程池,我们来看一下都有哪些
Serial Message Processor = serialThread
View Message Processor = viewThread
Pooled Message Processor = threadPool
Pooled High Priority Message Processor = highPriorityPool
Pooled Waiting Message Processor = waitingPool
PrMetaData cleanup Message Processor = prMetaDataCleanupThreadPool
PartitionedRegion Message Processor = partitionedRegionPool
Function Execution Processor = functionExecutionPool || functionExecutionThread
memberEventThread
membershipManager = MemberFactory.newMembershipManager(l, system.getConfig(), transport,
stats, system.getSecurityService());
// Allow events to start being processed.
membershipManager.startEventProcessing(); //Line 1071
for (;;) {
this.getCancelCriterion().checkCancelInProgress(null);
boolean interrupted = Thread.interrupted();
try {
membershipManager.waitForEventProcessing(); //Line 1076
break;
} catch (InterruptedException e) {
interrupted = true;
} finally {
if (interrupted) {
Thread.currentThread().interrupt();
}
}
}
- GMSMemberFactory.newMembershipManager
- Services.start
有一段加入集群的代码
try {
this.manager.joinDistributedSystem();
} catch (Throwable e) {
stop();
throw e;
}
- GMSMembershipManager.joinDistributedSystem
- GMSMembershipManager.join
boolean ok = services.getJoinLeave().join();
- GMSJoinLeave.join
下面来看一下这个join的代码, “Unable to join the distributed system” 这个错误消息就是这段代码抛出来的
//TODO 未完待续
/**
 * Attempts to join the distributed system.
 * <p>
 * The method loops: it looks for a membership coordinator via {@code findCoordinator()} and, when
 * a candidate is found, either becomes the coordinator itself or tries to join through the
 * candidate with {@code attemptToJoin()}. The loop ends when this member has joined, when the
 * service is stopping, or when one of the deadlines derived from the configured locator wait time
 * and join timeout has passed.
 * <p>
 * If the response indicates there's no coordinator it will contain a set of members that have
 * recently contacted it. The "oldest" member is selected as the coordinator based on ID sort
 * order.
 *
 * @return true if this member joined (or became coordinator); false if it gave up or the retry
 *         sleep was interrupted
 * @throws SystemConnectException if the member did not join and
 *         {@code state.hasContactedAJoinedLocator} is set; the message reports the elapsed
 *         milliseconds ("Unable to join the distributed system in ...ms")
 */
public boolean join() {
try {
// Boolean.getBoolean reads the JVM system property named by BYPASS_DISCOVERY_PROPERTY; when it
// is "true", discovery is skipped and this member becomes the coordinator right away.
if (Boolean.getBoolean(BYPASS_DISCOVERY_PROPERTY)) {
synchronized (viewInstallationLock) {
becomeCoordinator();
}
return true;
}
SearchState state = searchState;
// The locator wait time is multiplied by 1000 and then added to a currentTimeMillis() value,
// so it is presumably configured in seconds - TODO confirm against the config documentation.
long locatorWaitTime = ((long) services.getConfig().getLocatorWaitTime()) * 1000L;
// The join timeout is added to currentTimeMillis() unscaled, i.e. it is used as milliseconds.
long timeout = services.getConfig().getJoinTimeout();
logger.debug("join timeout is set to {}", timeout);
long retrySleep = JOIN_RETRY_SLEEP;
long startTime = System.currentTimeMillis();
// Deadline for waiting while no locator has been contacted at all.
long locatorGiveUpTime = startTime + locatorWaitTime;
// Deadline for the join attempts themselves; it is pushed out again in several places below.
long giveupTime = startTime + timeout;
// Keep trying until we have joined or the service is being stopped.
for (int tries = 0; !this.isJoined && !this.isStopping; tries++) {
logger.debug("searching for the membership coordinator");
boolean found = findCoordinator(); // look up the membership coordinator
logger.debug("state after looking for membership coordinator is {}", state);
if (found) {
logger.debug("found possible coordinator {}", state.possibleCoordinator);
// The candidate is this very member and it is eligible to coordinate: take over the
// coordinator role once more than two tries have been made or while still before the deadline.
if (localAddress.getNetMember().preferredForCoordinator()
&& state.possibleCoordinator.equals(this.localAddress)) {
if (tries > 2 || System.currentTimeMillis() < giveupTime) {
synchronized (viewInstallationLock) {
becomeCoordinator();
}
return true;
}
} else {
// Otherwise try to join through the candidate coordinator.
if (attemptToJoin()) {
return true;
}
// The attempt failed: remember the candidate in alreadyTried (unless it is ourselves).
if (!state.possibleCoordinator.equals(localAddress)) {
state.alreadyTried.add(state.possibleCoordinator);
}
if (System.currentTimeMillis() > giveupTime) {
break;
}
}
} else {
long now = System.currentTimeMillis();
// No coordinator candidate. If no locator has been contacted yet, keep waiting until the
// locator deadline and restart the try counter and join deadline; otherwise give up once
// the join deadline has passed.
if (state.locatorsContacted <= 0) {
if (now > locatorGiveUpTime) {
// break out of the loop and return false
break;
}
tries = 0;
giveupTime = now + timeout;
} else if (now > giveupTime) {
break;
}
}
try {
if (found && !state.hasContactedAJoinedLocator) {
// if locators are restarting they may be handing out IDs from a stale view that
// we should go through quickly. Otherwise we should sleep a bit to let failure
// detection select a new coordinator
if (state.possibleCoordinator.getVmViewId() < 0) {
logger.debug("sleeping for {} before making another attempt to find the coordinator",
retrySleep);
Thread.sleep(retrySleep);
}
// since we were given a coordinator that couldn't be used we should keep trying
tries = 0;
giveupTime = System.currentTimeMillis() + timeout;
}
} catch (InterruptedException e) {
// NOTE(review): the thread's interrupt status is not restored here before returning;
// confirm whether callers depend on that.
logger.debug("retry sleep interrupted - giving up on joining the distributed system");
return false;
}
} // for
if (!this.isJoined) {
logger.debug("giving up attempting to join the distributed system after "
+ (System.currentTimeMillis() - startTime) + "ms");
}
// to preserve old behavior we need to throw a SystemConnectException if
// unable to contact any of the locators
// NOTE(review): the condition below throws only when hasContactedAJoinedLocator is true, which
// does not obviously match the wording above - confirm the intended meaning. When the flag is
// false and we have not joined, the method simply returns false.
if (!this.isJoined && state.hasContactedAJoinedLocator) {
throw new SystemConnectException("Unable to join the distributed system in "
+ (System.currentTimeMillis() - startTime) + "ms");
}
return this.isJoined;
} finally {
// notify anyone waiting on the address to be completed
if (this.isJoined) {
synchronized (this.localAddress) {
this.localAddress.notifyAll();
}
}
// Runs on every exit path (return or exception), whether or not the join succeeded.
searchState.cleanup();
}
}
在 server 的日志文件中可以看到大量下面这样的错误(注意:我们在启动命令里把 server 的端口指定为 0,即 --server-port=0):
[error 2018/11/20 11:47:24.300 CST server1 <Geode UDP Timer-2,host-2531> tid=0x1a] Exception caught while sending message
java.net.BindException: Cannot assign requested address: Datagram send failed
at java.net.TwoStacksPlainDatagramSocketImpl.send(Native Method)
at java.net.DatagramSocket.send(DatagramSocket.java:693)
at org.jgroups.protocols.UDP._send(UDP.java:224)
at org.jgroups.protocols.UDP.sendUnicast(UDP.java:215)
at org.jgroups.protocols.TP.sendToSingleMember(TP.java:1974)
at org.jgroups.protocols.TP.doSend(TP.java:1962)
at org.apache.geode.distributed.internal.membership.gms.messenger.Transport.doSend(Transport.java:78)
at org.jgroups.protocols.TP.send(TP.java:1948)
at org.apache.geode.distributed.internal.membership.gms.messenger.Transport._send(Transport.java:50)
at org.jgroups.protocols.TP.down(TP.java:1515)
at org.jgroups.stack.Protocol.down(Protocol.java:439)
at org.apache.geode.distributed.internal.membership.gms.messenger.StatRecorder.down(StatRecorder.java:88)
at org.jgroups.protocols.UNICAST3.retransmit(UNICAST3.java:753)
at org.jgroups.protocols.UNICAST3.triggerXmit(UNICAST3.java:1559)
at org.jgroups.protocols.UNICAST3$RetransmitTask.run(UNICAST3.java:1509)
at org.jgroups.util.TimeScheduler3$Task.run(TimeScheduler3.java:291)
at org.jgroups.util.TimeScheduler3$RecurringTask.run(TimeScheduler3.java:325)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)