0、节点及角色安排
节点 | NameNode | SecondaryNameNode | DataNode | ResourceManager | NodeManager |
---|---|---|---|---|---|
node1 | √ | | √ | √ | √ |
node2 | | √ | √ | | √ |
node3 | | | √ | | √ |
1、下载 hadoop-3.3.5 安装包
http://archive.apache.org/dist/hadoop/common/hadoop-3.3.5/hadoop-3.3.5.tar.gz
下载好后传到主节点上
2、解压到 /opt/module目录下
tar -zxvf hadoop-3.3.5.tar.gz -C /opt/module
3、修改hadoop权限
sudo chown -R bigdata:bigdata /opt/module/hadoop-3.3.5
4、添加环境变量
vi /etc/profile
#HADOOP_HOME
export HADOOP_HOME=/opt/module/hadoop-3.3.5
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
5、创建hadoop数据目录
node1本地
mkdir /data/hadoop
mkdir /data/hadoop/hdfs
mkdir /data/hadoop/yarn
node2、node3
mkdir /data/hadoop
sudo mkdir /data/hadoop/hdfs
sudo mkdir /data/hadoop/yarn
sudo chown -R bigdata:bigdata /data/hadoop
6、修改Hadoop配置文件
core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!--hdfs端口/url配置-->
<property>
<name>fs.defaultFS</name>
<value>hdfs://node1:9000</value>
</property>
<!--临时目录-->
<property>
<name>hadoop.tmp.dir</name>
<value>file:/data/hadoop/tmp</value>
<description>Abase for other temporary directories.</description>
</property>
<!-- 缓冲区大小,实际工作中根据服务器性能动态调整 4096为默认值 -->
<property>
<name>io.file.buffer.size</name>
<value>4096</value>
</property>
<!--回收站配置-->
<property>
<name>fs.trash.interval</name>
<value>1440</value>
</property>
<property>
<name>fs.trash.checkpoint.interval</name>
<value>60</value>
</property>
</configuration>
hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!--数据目录-->
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/data/hadoop/hdfs/nn</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/data/hadoop/hdfs/data</value>
</property>
<!--hdfs备份数配置-->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!--线程配置(当前数值为cdh默认)-->
<!--NN处理程序线程数-->
<property>
<name>dfs.namenode.handler.count</name>
<value>30</value>
</property>
<!--NN服务处理程序线程数-->
<property>
<name>dfs.namenode.service.handler.count</name>
<value>30</value>
</property>
<!--DN处理程序线程数-->
<property>
<name>dfs.datanode.handler.count</name>
<value>3</value>
</property>
<!--DN最大传输线程数-->
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>4096</value>
</property>
<!--secondary namenode配置-->
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>node2:50090</value>
</property>
<!--权限管理-->
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
<property>
<name>fs.permissions.umask-mode</name>
<value>022</value>
</property>
<property>
<name>dfs.namenode.acls.enabled</name>
<value>false</value>
</property>
</configuration>
mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- 指定分布式计算使用的框架是yarn -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- 开启MapReduce小任务模式 -->
<property>
<name>mapreduce.job.ubertask.enable</name>
<value>true</value>
</property>
<!-- 设置历史任务的主机和端口 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>node1:10020</value>
</property>
<!-- 设置网页访问历史任务的主机和端口 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>node1:19888</value>
</property>
<!-- 环境参数 -->
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
</configuration>
yarn-site.xml
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<configuration>
<!-- Site specific YARN configuration properties -->
<!--ResourceManager地址与端口-->
<property>
<name>yarn.resourcemanager.hostname</name>
<value>node1</value>
</property>
<!--NodeManager 本地目录-->
<property>
<name>yarn.nodemanager.local-dirs</name>
<value>/data/hadoop/yarn/nm</value>
</property>
<!--每个nodemanager可使用的cpu/内存 -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>28</value>
</property>
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>81920</value>
</property>
<!--container日志存储 / CDH配置了日志转移到HDFS-->
<!--本地container日志存储路径-->
<property>
<name>yarn.nodemanager.log-dirs</name>
<value>/data/hadoop/yarn/container-logs</value>
</property>
<!--开启日志聚合-->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!--日志聚合hdfs存储路径-->
<property>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/tmp/logs</value>
</property>
<property>
<name>yarn.nodemanager.remote-app-log-dir-suffix</name>
<value>logs</value>
</property>
<!--hdfs上的日志保留时间-->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>604800</value>
</property>
<property>
<!--应用执行完日志保留的时间,默认0,即执行完本地立刻删除-->
<name>yarn.nodemanager.delete.debug-delay-sec</name>
<value>0</value>
</property>
<!--日志web url-->
<property>
<name>yarn.log.server.url</name>
<value>http://node1:19888/jobhistory/logs/</value>
</property>
<!--Thread-count配置(值为cdh默认)-->
<!--RM-->
<!--RM客户端线程数-->
<property>
<name>yarn.resourcemanager.client.thread-count</name>
<value>50</value>
</property>
<!--RM调度程序线程数-->
<property>
<name>yarn.resourcemanager.scheduler.client.thread-count</name>
<value>50</value>
</property>
<!--RM管理客户端线程数-->
<property>
<name>yarn.resourcemanager.admin.client.thread-count</name>
<value>1</value>
</property>
<!--RM资源跟踪器线程数-->
<property>
<name>yarn.resourcemanager.resource-tracker.client.thread-count</name>
<value>50</value>
</property>
<!--NM-->
<!--NM容器管理器线程数-->
<property>
<name>yarn.nodemanager.container-manager.thread-count</name>
<value>20</value>
</property>
<!--NM清理线程数-->
<property>
<name>yarn.nodemanager.delete.thread-count</name>
<value>4</value>
</property>
<!--Yarn Scheduler配置 Container的cpu内存上下限-->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>1024</value>
</property>
<property>
<name>yarn.scheduler.increment-allocation-mb</name>
<value>512</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>65536</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-vcores</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.increment-allocation-vcores</name>
<value>1</value>
</property>
<property>
<name>yarn.scheduler.maximum-allocation-vcores</name>
<value>32</value>
</property>
<!-- Scheduler配置 CDH使用的是FairScheduler (也可先不配 用默认)
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
-->
<!--Mr参数-->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
workers文件
vi workers
node1
node2
node3
在hadoop-env.sh配置JAVA_HOME与各进程jvm内存
export JAVA_HOME=/usr/java/jdk1.8.0_212
export HDFS_NAMENODE_OPTS="-Xms4294967296 -Xmx4294967296 -Dhadoop.security.logger=INFO,RFAS"
export HDFS_SECONDARYNAMENODE_OPTS="-Xms1073741824 -Xmx1073741824 -Dhadoop.security.logger=INFO,RFAS"
export HDFS_DATANODE_OPTS="-Xms2147483648 -Xmx2147483648 -Dhadoop.security.logger=ERROR,RFAS"
配置yarn-env.sh各进程jvm内存
export YARN_RESOURCEMANAGER_OPTS="-Xms2147483648 -Xmx2147483648 -Drm.audit.logger=INFO,RMAUDIT"
export YARN_NODEMANAGER_OPTS="-Xms2147483648 -Xmx2147483648 -Dnm.audit.logger=INFO,NMAUDIT"
配置hadoop的日志输出
配置log4j.properties中的hadoop.log.dir不生效
需配置hadoop-env.sh
vim /opt/module/hadoop-3.3.5/etc/hadoop/hadoop-env.sh
#修改
export HADOOP_LOG_DIR=/data/log/hadoop
#pid默认存在/tmp 也需修改
export HADOOP_PID_DIR=/data/hadoop
7、将配置好的文件拷贝到其他节点
scp -r /opt/module/hadoop-3.3.5 node2:/opt/module/
scp -r /opt/module/hadoop-3.3.5 node3:/opt/module/
#修改权限
sudo chown -R bigdata:bigdata /opt/module/hadoop-3.3.5/
8、格式化namenode
/opt/module/hadoop-3.3.5/bin/hdfs namenode -format
9、配置集群启动
hdfs:
vi /opt/bash/hdfs.sh
#!/bin/bash
case $1 in
start)
/opt/module/hadoop-3.3.5/sbin/start-dfs.sh
;;
stop)
/opt/module/hadoop-3.3.5/sbin/stop-dfs.sh
;;
*)
echo "输入的参数不对"
echo " start 启动hdfs集群"
echo " stop 停止hdfs集群"
;;
esac
修改权限
chmod 744 /opt/bash/hdfs.sh
可以使用命令启动与关闭组件:
/opt/bash/hdfs.sh start
/opt/bash/hdfs.sh stop
yarn:
vi /opt/bash/yarn.sh
#!/bin/bash
case $1 in
start)
/opt/module/hadoop-3.3.5/sbin/start-yarn.sh
;;
stop)
/opt/module/hadoop-3.3.5/sbin/stop-yarn.sh
;;
*)
echo "输入的参数不对"
echo " start 启动yarn集群"
echo " stop 停止yarn集群"
;;
esac
修改权限
chmod 744 /opt/bash/yarn.sh
可以使用命令启动与关闭组件:
/opt/bash/yarn.sh start
/opt/bash/yarn.sh stop
10、jps验证
node1:
DataNode
NameNode
ResourceManager
NodeManager
Jps
node2:
DataNode
NodeManager
SecondaryNameNode
Jps
node3:
DataNode
NodeManager
Jps
Web端口查看集群状况
在浏览器输入:http://node1:8088打开ResourceManager页面。
在浏览器输入:http://node1:9870打开Hadoop Namenode页面。
11、使用systemctl 管理hdfs和yarn
因为每个节点hdfs和yarn启动的角色不太一样,所以需要根据启动的角色进行配置
创建hdfs.service文件
sudo vim /usr/lib/systemd/system/hdfs.service
node1 配置
[Unit]
Description=hdfs
Requires=network-online.target
After=network-online.target
[Service]
Type=forking
User=bigdata
Group=bigdata
Restart=on-failure
WorkingDirectory=/opt/module/hadoop-3.3.5
ExecStart=/opt/module/hadoop-3.3.5/sbin/hadoop-daemon.sh start namenode
[Install]
WantedBy=multi-user.target
node2 配置
[Unit]
Description=hdfs
Requires=network-online.target
After=network-online.target
[Service]
Type=forking
User=bigdata
Group=bigdata
Restart=on-failure
WorkingDirectory=/opt/module/hadoop-3.3.5
ExecStart=/bin/bash -c '/opt/module/hadoop-3.3.5/sbin/hadoop-daemon.sh start secondarynamenode && /opt/module/hadoop-3.3.5/sbin/hadoop-daemon.sh start datanode'
[Install]
WantedBy=multi-user.target
node3配置
[Unit]
Description=hdfs
Requires=network-online.target
After=network-online.target
[Service]
Type=forking
User=bigdata
Group=bigdata
Restart=on-failure
WorkingDirectory=/opt/module/hadoop-3.3.5
ExecStart=/opt/module/hadoop-3.3.5/sbin/hadoop-daemon.sh start datanode
[Install]
WantedBy=multi-user.target
创建yarn.service文件
sudo vim /usr/lib/systemd/system/yarn.service
node1配置
[Unit]
Description=yarn
Requires=network-online.target
After=network-online.target
[Service]
Type=forking
User=bigdata
Group=bigdata
Restart=on-failure
WorkingDirectory=/opt/module/hadoop-3.3.5
ExecStart=/bin/bash -c '/opt/module/hadoop-3.3.5/sbin/yarn-daemon.sh start resourcemanager && /opt/module/hadoop-3.3.5/sbin/yarn-daemon.sh start nodemanager'
[Install]
WantedBy=multi-user.target
node2、node3配置
[Unit]
Description=yarn
Requires=network-online.target
After=network-online.target
[Service]
Type=forking
User=bigdata
Group=bigdata
Restart=on-failure
WorkingDirectory=/opt/module/hadoop-3.3.5
ExecStart=/opt/module/hadoop-3.3.5/sbin/yarn-daemon.sh start nodemanager
[Install]
WantedBy=multi-user.target
启动、关闭验证是否配置成功
sudo systemctl start hdfs/yarn
sudo systemctl stop hdfs/yarn
sudo systemctl status hdfs/yarn
验证成功配置开机自启
sudo systemctl enable hdfs/yarn