1. YARN:两个 ResourceManager 的状态都为 standby,均无法切换为 active
错误信息:
org.apache.hadoop.ha.ServiceFailedException: RM could not transition to Active
2022-08-10 20:04:19,009 INFO org.apache.hadoop.yarn.event.AsyncDispatcher: AsyncDispatcher is draining to stop, ignoring any new events.
2022-08-10 20:04:19,009 INFO org.apache.hadoop.service.AbstractService: Service RMActiveServices failed in state STARTED
java.lang.IllegalArgumentException: java.net.UnknownHostException: king
at org.apache.hadoop.security.SecurityUtil.buildTokenService(SecurityUtil.java:447)
at org.apache.hadoop.hdfs.NameNodeProxiesClient.createProxyWithClientProtocol(NameNodeProxiesClient.java:139)
at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:356)
at org.apache.hadoop.hdfs.DFSClient.<init>(DFSClient.java:290)
at org.apache.hadoop.hdfs.DistributedFileSystem.initialize(DistributedFileSystem.java:171)
at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:3303)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:476)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:227)
at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:463)
at org.apache.hadoop.fs.Path.getFileSystem(Path.java:365)
at org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore.startInternal(FileSystemRMStateStore.java:155)
at org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.serviceStart(RMStateStore.java:746)
at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$RMActiveServices.serviceStart(ResourceManager.java:868)
at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startActiveServices(ResourceManager.java:1262)
at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1303)
at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager$1.run(ResourceManager.java:1299)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1730)
at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.transitionToActive(ResourceManager.java:1299)
at org.apache.hadoop.yarn.server.resourcemanager.AdminService.transitionToActive(AdminService.java:327)
at org.apache.hadoop.yarn.server.resourcemanager.ActiveStandbyElectorBasedElectorService.becomeActive(ActiveStandbyElectorBasedElectorService.java:144)
at org.apache.hadoop.ha.ActiveStandbyElector.becomeActive(ActiveStandbyElector.java:896)
at org.apache.hadoop.ha.ActiveStandbyElector.processResult(ActiveStandbyElector.java:476)
at org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:610)
at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:508)
Caused by: java.net.UnknownHostException: king
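问题源于 yarn-site.xml 文件中的配置:从上面的堆栈可以看出,ResourceManager 在切换为 active 时使用的是 FileSystemRMStateStore,它需要通过 HDFS 保存状态,但主机名 king 无法解析(推测 king 是 HDFS 的 nameservice 名称,需确认),于是 RMActiveServices 启动失败,两个 ResourceManager 都停留在 standby。由于没有指定把状态存储到 zookeeper 集群,节点状态无法保存在 zookeeper 中。增加下面的配置,将状态存储改为 ZKRMStateStore 后重启: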
<!--指定resourcemanager的状态信息存储在zookeeper集群-->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
重启后运行正常