Contents
Download Java
Install ZooKeeper
Create the Hadoop cluster
Hadoop Learning by Example (1): Setting Up a Hadoop 3.x Cluster
!!! The heart of the HA setup is Hadoop's four configuration files.
Add environment variables
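A minimal sketch of what gets appended to /etc/profile on every node. HADOOP_HOME matches the install path used throughout this article; the JAVA_HOME and ZOOKEEPER_HOME paths are assumptions, so substitute your own:
export JAVA_HOME=/usr/java/jdk1.8.0_281                 # assumed JDK path, adjust to your install
export ZOOKEEPER_HOME=/usr/zookeeper/zookeeper-3.6.2    # assumed ZooKeeper path, adjust to your install
export HADOOP_HOME=/usr/hadoop/hadoop-3.1.4
export PATH=$PATH:$JAVA_HOME/bin:$ZOOKEEPER_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
Reload with source /etc/profile afterwards.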
Configure the core-site.xml file
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
    <!-- Default filesystem URI; points at the HA nameservice rather than a single NameNode -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://ns1</value>
    </property>
    <!-- Base directory for Hadoop's temporary files -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/usr/hadoop/hadoop-3.1.4/tmp</value>
    </property>
    <!-- User shown as the owner of files in the HDFS web UI -->
    <property>
        <name>hadoop.http.staticuser.user</name>
        <value>hadoop</value>
    </property>
    <!-- ZooKeeper quorum used by the ZKFC for automatic failover -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>master1:2181,slave1:2181,slave2:2181</value>
    </property>
    <!-- Allow the hadoop user to proxy from any host, for any group -->
    <property>
        <name>hadoop.proxyuser.hadoop.hosts</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.proxyuser.hadoop.groups</name>
        <value>*</value>
    </property>
</configuration>
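hadoop.tmp.dir above points at a directory that does not exist in a freshly unpacked distribution; creating it on every node up front (a precaution, not a step from the original) avoids start-up surprises:
[root@master1 hadoop]# mkdir -p /usr/hadoop/hadoop-3.1.4/tmp
Repeat on slave1 and slave2.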
Configure the hdfs-site.xml file
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
    <!-- Number of block replicas -->
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <!-- Disable HDFS permission checking (convenient for a test cluster) -->
    <property>
        <name>dfs.permissions.enabled</name>
        <value>false</value>
    </property>
    <!-- Logical name of the HA nameservice -->
    <property>
        <name>dfs.nameservices</name>
        <value>ns1</value>
    </property>
    <!-- Block size: 128 MB -->
    <property>
        <name>dfs.blocksize</name>
        <value>134217728</value>
    </property>
    <!-- The two NameNodes inside nameservice ns1 -->
    <property>
        <name>dfs.ha.namenodes.ns1</name>
        <value>nn1,nn2</value>
    </property>
    <!-- RPC address of nn1 (the host nn1 runs on) -->
    <property>
        <name>dfs.namenode.rpc-address.ns1.nn1</name>
        <value>master1:9000</value>
    </property>
    <!-- HTTP address of nn1, for external access -->
    <property>
        <name>dfs.namenode.http-address.ns1.nn1</name>
        <value>master1:50070</value>
    </property>
    <!-- RPC address of nn2 (the host nn2 runs on) -->
    <property>
        <name>dfs.namenode.rpc-address.ns1.nn2</name>
        <value>slave1:9000</value>
    </property>
    <!-- HTTP address of nn2, for external access -->
    <property>
        <name>dfs.namenode.http-address.ns1.nn2</name>
        <value>slave1:50070</value>
    </property>
    <!-- Where the NameNode's edit log is stored on the JournalNodes (usually co-located with ZooKeeper); all three JournalNodes started below are listed -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://master1:8485;slave1:8485;slave2:8485/ns1</value>
    </property>
    <!-- Where each JournalNode keeps its data on local disk -->
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/usr/hadoop/hadoop-3.1.4/journaldata</value>
    </property>
    <!-- Enable automatic NameNode failover for ns1 -->
    <property>
        <name>dfs.ha.automatic-failover.enabled.ns1</name>
        <value>true</value>
    </property>
    <!-- Java class HDFS clients use to find out which NameNode is currently active -->
    <property>
        <name>dfs.client.failover.proxy.provider.ns1</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <!-- Fencing methods used during failover. Several exist (see the official docs, linked at the end of the article); sshfence logs in over SSH and kills the stale NameNode, with shell(/bin/true) as a fallback -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>
            sshfence
            shell(/bin/true)
        </value>
    </property>
    <!-- Passwordless-SSH private key; only needed when using the sshfence method -->
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/root/.ssh/id_rsa</value>
    </property>
    <!-- sshfence connection timeout; like the key above, unnecessary if you fence with a shell script instead -->
    <property>
        <name>dfs.ha.fencing.ssh.connect-timeout</name>
        <value>30000</value>
    </property>
    <!-- Global switch for automatic failover; leave it out if you do not want automatic failover -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
    <!-- Directory for HDFS filesystem metadata -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/usr/hadoop/hadoop-3.1.4/name</value>
    </property>
    <!-- Directory for HDFS block data -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/usr/hadoop/hadoop-3.1.4/data</value>
    </property>
    <!-- Enable the WebHDFS REST API -->
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
</configuration>
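Note that sshfence only works if the two NameNode hosts can SSH to each other as root without a password, using the private key configured above. If that is not in place yet, a typical setup (hostnames as in this cluster) looks like:
[root@master1 ~]# ssh-keygen -t rsa        # accept the defaults; writes /root/.ssh/id_rsa
[root@master1 ~]# ssh-copy-id root@master1
[root@master1 ~]# ssh-copy-id root@slave1
[root@master1 ~]# ssh-copy-id root@slave2
Repeat from slave1, since either NameNode may need to fence the other.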
Configure the yarn-site.xml file
<?xml version="1.0"?>
<configuration>
    <!-- Auxiliary service run on each NodeManager; must be mapreduce_shuffle for MapReduce jobs -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <!-- Enable ResourceManager HA (off by default) -->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>
    <!-- Logical name of the ResourceManager cluster -->
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>yrc</value>
    </property>
    <!-- Two ResourceManagers are used; give each one a logical id -->
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>
    <!-- Host of rm1 -->
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>master1</value>
    </property>
    <!-- Host of rm2 -->
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>slave1</value>
    </property>
    <!-- Classpath for YARN applications -->
    <property>
        <name>yarn.application.classpath</name>
        <value>/usr/hadoop/hadoop-3.1.4/etc/hadoop:/usr/hadoop/hadoop-3.1.4/share/hadoop/common/lib/*:/usr/hadoop/hadoop-3.1.4/share/hadoop/common/*:/usr/hadoop/hadoop-3.1.4/share/hadoop/hdfs:/usr/hadoop/hadoop-3.1.4/share/hadoop/hdfs/lib/*:/usr/hadoop/hadoop-3.1.4/share/hadoop/hdfs/*:/usr/hadoop/hadoop-3.1.4/share/hadoop/mapreduce/lib/*:/usr/hadoop/hadoop-3.1.4/share/hadoop/mapreduce/*:/usr/hadoop/hadoop-3.1.4/share/hadoop/yarn:/usr/hadoop/hadoop-3.1.4/share/hadoop/yarn/lib/*:/usr/hadoop/hadoop-3.1.4/share/hadoop/yarn/*</value>
    </property>
    <!-- Address of the ZooKeeper ensemble -->
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>master1:2181,slave1:2181,slave2:2181</value>
    </property>
    <!-- Enable RM recovery: if an RM dies with jobs in flight, its replacement restores their state (default false) -->
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>
    <!-- Per-service addresses for rm1 -->
    <property>
        <name>yarn.resourcemanager.address.rm1</name>
        <value>master1:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address.rm1</name>
        <value>master1:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address.rm1</name>
        <value>master1:8088</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm1</name>
        <value>master1:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address.rm1</name>
        <value>master1:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.admin.address.rm1</name>
        <value>master1:23142</value>
    </property>
    <!-- Per-service addresses for rm2 -->
    <property>
        <name>yarn.resourcemanager.address.rm2</name>
        <value>slave1:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address.rm2</name>
        <value>slave1:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address.rm2</name>
        <value>slave1:8088</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm2</name>
        <value>slave1:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address.rm2</name>
        <value>slave1:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.admin.address.rm2</name>
        <value>slave1:23142</value>
    </property>
    <!-- Store ResourceManager state in the ZooKeeper ensemble (the default store is FileSystem-based) -->
    <property>
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>
    <property>
        <name>yarn.nodemanager.vmem-check-enabled</name>
        <value>false</value>
        <description>
            Skip the virtual-memory check. Very useful when running inside virtual machines; it prevents a class of container-killed errors later on.
        </description>
    </property>
</configuration>
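Once the cluster is up (start-up steps below), RM HA can be verified with yarn rmadmin; which of the two comes back active depends on the election:
[root@master1 hadoop]# yarn rmadmin -getServiceState rm1
active
[root@master1 hadoop]# yarn rmadmin -getServiceState rm2
standby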
Configure the mapred-site.xml file
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
    <!-- Tell Hadoop that MapReduce (MR) jobs run on YARN -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <!-- Classpath for MapReduce applications; the MR jars live under share/hadoop/mapreduce in the distribution -->
    <property>
        <name>mapreduce.application.classpath</name>
        <value>/usr/hadoop/hadoop-3.1.4/share/hadoop/mapreduce/*,/usr/hadoop/hadoop-3.1.4/share/hadoop/mapreduce/lib/*</value>
    </property>
</configuration>
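A wrong MapReduce classpath usually surfaces later as jobs failing with "Error: Could not find or load main class org.apache.hadoop.mapreduce.v2.app.MRAppMaster". To sanity-check what will actually be on the classpath, run:
[root@master1 hadoop]# hadoop classpath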
Configure the workers file
master1
slave1
slave2
Start the HA cluster
Start the ZooKeeper cluster
Start commands:
[root@master1 hadoop]# zkServer.sh start
[root@slave1 hadoop]# zkServer.sh start
[root@slave2 hadoop]# zkServer.sh start
Check the status:
[root@master1 hadoop]# zkServer.sh status
[root@slave1 hadoop]# zkServer.sh status
[root@slave2 hadoop]# zkServer.sh status
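In a healthy three-node ensemble, one node reports itself as leader and the other two as followers, along the lines of:
[root@master1 hadoop]# zkServer.sh status
...
Mode: follower
(one of the three machines prints Mode: leader instead)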
Start the JournalNodes first
Start a JournalNode on each of the three virtual machines (hadoop-daemon.sh still works in Hadoop 3 but is deprecated; hdfs --daemon start journalnode is the current form).
Start commands:
[root@master1 hadoop]# hadoop-daemon.sh start journalnode
[root@slave1 hadoop]# hadoop-daemon.sh start journalnode
[root@slave2 hadoop]# hadoop-daemon.sh start journalnode
Format the NameNode on master1
[root@master1 hadoop]# hdfs namenode -format
!!! Send the formatted hadoop directory to slave1 (the other NameNode)
[root@master1 hadoop]# scp -r hadoop-3.1.4 root@slave1:/usr/hadoop/
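Copying the whole directory works; the built-in alternative is to bootstrap the standby NameNode on slave1, run once the JournalNodes and the freshly formatted NameNode on master1 are up:
[root@slave1 hadoop]# hdfs namenode -bootstrapStandby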
Format ZKFC on master1 (creates the failover controller's znode in ZooKeeper)
[root@master1 hadoop]# hdfs zkfc -formatZK
Start HDFS and YARN
First modify their start-up scripts, found under $HADOOP_HOME/sbin.
Add the following lines to start-dfs.sh and stop-dfs.sh:
HDFS_NAMENODE_USER=root
HDFS_DATANODE_USER=root
HDFS_DATANODE_SECURE_USER=root
HDFS_SECONDARYNAMENODE_USER=root
HDFS_JOURNALNODE_USER=root
HDFS_ZKFC_USER=root
Add the following lines to start-yarn.sh and stop-yarn.sh:
YARN_RESOURCEMANAGER_USER=root
HADOOP_SECURE_DN_USER=yarn
YARN_NODEMANAGER_USER=root
Start DFS
[root@master1 hadoop]# start-dfs.sh
Start YARN
[root@master1 hadoop]# start-yarn.sh
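With both layers up, the NameNode HA state can be queried the same way as the ResourceManagers; one of nn1/nn2 should report active and the other standby:
[root@master1 hadoop]# hdfs haadmin -getServiceState nn1
active
[root@master1 hadoop]# hdfs haadmin -getServiceState nn2
standby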
Check with jps
jps on master1
jps on slave1
jps on slave2
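With this configuration, the process lists should look roughly as follows (QuorumPeerMain is the ZooKeeper server; Jps itself also appears):
master1: NameNode, DataNode, JournalNode, DFSZKFailoverController, ResourceManager, NodeManager, QuorumPeerMain
slave1:  NameNode, DataNode, JournalNode, DFSZKFailoverController, ResourceManager, NodeManager, QuorumPeerMain
slave2:  DataNode, JournalNode, NodeManager, QuorumPeerMain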
View in the browser (the HTTP addresses configured in hdfs-site.xml)
master1: http://master1:50070
slave1: http://slave1:50070
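As a final smoke test, the examples jar that ships with the distribution (jar name assumed from the version used here) can run a small job end to end:
[root@master1 hadoop]# hadoop jar /usr/hadoop/hadoop-3.1.4/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.1.4.jar pi 2 10
If the job completes and prints an estimate of pi, HDFS, YARN, and the MapReduce classpath are all working.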