一.前言
Hadoop HA主要解决HDFS NameNode与YARN ResourceManager的单点故障问题:通过引入备用节点,在主节点发生故障时借助ZooKeeper自动切换到备用节点。
二.配置
2.1 HDFS HA
注意:主备切换主要依赖两项配置:
1.能相互免密登录
2.dfs.ha.fencing.methods
2.1.1 vi hadoop-env.sh
#################
#added for HDFS ha
export HDFS_JOURNALNODE_USER=root
export HDFS_ZKFC_USER=root
2.1.2 vi core-site.xml
#################
<configuration>
<!--设置fs.defaultFS为nameservices的逻辑主机名!!!IMPORTANT-->
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<!--设置zookeeper位置信息-->
<property>
<name>ha.zookeeper.quorum</name>
<value>ipsnode1:2181,ipsnode2:2181,ipsnode3:2181</value>
</property>
</configuration>
2.1.3 hdfs-site.xml
#################
删除SecondaryNameNode的配置信息,其功能将由standby NameNode完成
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop2:50090</value>
</property>
新增/修改如下配置:
<configuration>
<!--定义nameservices逻辑名称-->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!--映射nameservices逻辑名称到namenode逻辑名称-->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<!--映射namenode逻辑名称到真实主机名称(RPC)-->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>ipsnode1:8020</value>
</property>
<!--映射namenode逻辑名称到真实主机名称(RPC)-->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>ipsnode2:8020</value>
</property>
<!--映射namenode逻辑名称nn1到真实主机名称(HTTP)-->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>ipsnode1:50070</value>
</property>
<!--映射namenode逻辑名称nn2到真实主机名称(HTTP)-->
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>ipsnode2:50070</value>
</property>
<!--配置journalnode集群位置信息及目录-->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://ipsnode1:8485;ipsnode2:8485;ipsnode3:8485/mycluster</value>
</property>
<!--配置editlog save dir-->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/hadoop/log</value>
</property>
<!--配置故障切换实现类-->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!--指定切换方式为SSH免密钥方式sshfence需要安装psmisc(fuser),add shell增加保障-->
<property>
<name>dfs.ha.fencing.methods</name>
<value>
sshfence
shell(/bin/true)
</value>
</property>
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_dsa</value>
</property>
<!--设置自动切换-->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
</configuration>
2.2 YARN HA
2.2.1 vi yarn-site.xml
#################
<configuration>
<!-- 开启RM高可用 -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!--是否启用自动故障转移。默认情况下,在启用 HA 时,启用自动故障转移。-->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!--启用内置的自动故障转移。默认情况下,在启用 HA 时,启用内置的自动故障转移。-->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
<value>true</value>
</property>
<!-- 指定RM的cluster id -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yrc</value>
</property>
<!-- 指定RM的名字 -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- 分别指定RM的地址(与本文其余部分的主机名保持一致:rm1=ipsnode1,rm2=ipsnode3,对应3.8节jps输出中运行ResourceManager的节点) -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>ipsnode1</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>ipsnode3</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>ipsnode1:8088</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>ipsnode3:8088</value>
</property>
<!--启用 resourcemanager 自动恢复-->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- 指定zk集群地址(与core-site.xml中ha.zookeeper.quorum保持一致) -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>ipsnode1:2181,ipsnode2:2181,ipsnode3:2181</value>
</property>
</configuration>
三. 启动
说明:以下各步骤的jps输出中,新出现的进程即为该步骤新启动的进程(原文以绿色标记,纯文本中请对比上一步输出)
3.1 zkServer.sh start #start zookeeper in all nodes
[root@ipsnode1 bin]# jps
2435 QuorumPeerMain
2476 Jps
3.2 hdfs --daemon start journalnode #in all nodes
[root@ipsnode1 bin]# jps
2435 QuorumPeerMain
2564 Jps
2525 JournalNode
3.3 hdfs zkfc -formatZK #生成hadoop-ha目录
3.4 hdfs namenode -format #primary nn
3.5 hdfs --daemon start namenode #primary nn
[root@ipsnode1 bin]# jps
2435 QuorumPeerMain
2747 NameNode
2525 JournalNode
2814 Jps
3.6 hdfs namenode -bootstrapStandby #standby nn,sync active namenode info
[root@ipsnode2 bin]# jps
2624 Jps
2421 QuorumPeerMain
2518 JournalNode
3.7 start-dfs.sh
[root@ipsnode1 sbin]# jps
2435 QuorumPeerMain
3108 DataNode
2747 NameNode
3499 DFSZKFailoverController
3547 Jps
2525 JournalNode
[root@ipsnode2 bin]# jps
2994 Jps
2421 QuorumPeerMain
2773 DataNode
2518 JournalNode
2953 DFSZKFailoverController
2683 NameNode
[root@ipsnode3 bin]# jps
2624 DataNode
2736 Jps
2421 QuorumPeerMain
2501 JournalNode
3.8 start-yarn.sh
[root@ipsnode1 sbin]# jps
4961 ResourceManager
5090 NodeManager
4503 NameNode
3464 DFSZKFailoverController
3083 DataNode
5244 Jps
2429 QuorumPeerMain
2510 JournalNode
[root@ipsnode2 bin]# jps
2849 DFSZKFailoverController
2419 QuorumPeerMain
3685 NodeManager
3798 Jps
2695 DataNode
3433 NameNode
2507 JournalNode
[root@ipsnode3 data]# jps
2496 JournalNode
2992 NodeManager
2596 DataNode
2918 ResourceManager
3117 Jps
2415 QuorumPeerMain
#若standby ResourceManager未随start-yarn.sh自动启动,需在对应节点手动执行:yarn --daemon start resourcemanager
四. 测试
Check HDFS HA
###############
http://192.168.100.101:50070/ 与 http://192.168.100.102:50070/ (分别查看两个NameNode的active/standby状态)
测试切换:
kill -9 namenodepid #node1
hdfs --daemon start namenode #node1
Check YARN HA
###############
http://192.168.100.101:8088/cluster 与 http://192.168.100.103:8088/cluster (访问standby RM时会被重定向到active RM)
测试切换:
kill -9 resourcemanagerpid #node1
yarn --daemon start resourcemanager #node1