大数据
hadoop
hadoop起源
HDFS常用组件
HDFS结构
Mapreduce
Mapreduce结构
Yarn结构
一、hadoop的安装
1、环境准备
(1)准备四台虚拟机
主机 角色
192.168.8.10(hadoop10) NameNode
192.168.8.11(hd01) DataNode
192.168.8.12(hd02) DataNode
192.168.8.13(hd03) DataNode
(2)编辑/etc/hosts(以hadoop10为例)
[root@hadoop10 ~]# vim /etc/hosts
192.168.8.10 hadoop10
192.168.8.11 hd01
192.168.8.12 hd02
192.168.8.13 hd03
(3)安装java环境(hadoop10)
[root@hadoop10 ~]# yum -y install java-1.8.0-openjdk-devel //其余Datanode节点主机也要安装
[root@hadoop10 ~]# tar -zxvf hadoop-2.7.6.tar.gz
[root@hadoop10 ~]#mv hadoop-2.7.6 /usr/local/hadoop/
[root@hadoop10 ~]#cd /usr/local/hadoop/
[root@hadoop10 hadoop]# rpm -ql java-1.8.0-openjdk //获取hadoop的java配置环境路径
[root@hadoop10 hadoop]# vim etc/hadoop/hadoop-env.sh
25 export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-1.8.0.131-11.b12.el7.x86_64/jre/
33 export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop
[root@hadoop10 hadoop]# ./bin/hadoop //不报错则上一步配置成功
(4)布置SSH信任关系
[root@hadoop10 ~]# vim /etc/ssh/ssh_config
Host *
GSSAPIAuthentication yes
StrictHostKeyChecking no //免密验证
[root@hadoop10 ~]# cd /root/.ssh/
[root@hadoop10 .ssh]# ssh-keygen -t rsa -b 2048 -N '' //生成密钥
[root@hadoop10 .ssh]# for i in 10 11 12 13;do ssh-copy-id 192.168.8.$i;done //部署公钥(包括本机)
[root@hadoop10 .ssh]# ssh hd01 //测试信任关系(以免密登录hd01为例)
Last login: Fri Apr 26 17:00:08 2019 from 192.168.8.254
[root@hd01 ~]#
2、配置hadoop
(1)修改slaves文件
[root@hadoop10 ~]# cd /usr/local/hadoop/etc/hadoop/
[root@hadoop10 hadoop]# vim /usr/local/hadoop/etc/hadoop/slaves
hd01
hd02
hd03
(2)hadoop的核心配置文件core-site.xml
[root@hadoop10 hadoop]# vim core-site.xml
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop10:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/var/hadoop</value>
</property>
</configuration>
[root@hadoop10 hadoop]# mkdir /var/hadoop //创建hadoop的数据根目录
[root@hadoop10 hadoop]# ssh hd01 mkdir /var/hadoop
[root@hadoop10 hadoop]# ssh hd02 mkdir /var/hadoop
[root@hadoop10 hadoop]# ssh hd03 mkdir /var/hadoop
(3)配置hdfs-site文件
[root@hadoop10 hadoop]# vim hdfs-site.xml
<configuration>
<property>
<name>dfs.namenode.http-address</name>
<value>hadoop10:50070</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoop10:50090</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>
(4)同步配置文件到hd01、hd02、hd03
[root@hadoop10 hadoop]# yum -y install rsync //同步的主机都要安装rsync
[root@hadoop10 hadoop]# for i in 11 12 13;do rsync -aSH --delete /usr/local/hadoop/ 192.168.8.$i:/usr/local/hadoop/;done
[root@hadoop10 hadoop]# ssh hd01 ls /usr/local/hadoop/ //以查看hd01同步结果为例
3、格式化,启动hadoop
[root@hadoop10 hadoop]# cd /usr/local/hadoop/
[root@hadoop10 hadoop]# ./bin/hdfs namenode -format
[root@hadoop10 hadoop]# ./sbin/start-dfs.sh
[root@hadoop10 hadoop]# jps //验证角色
8275 Jps
7262 NameNode
7455 SecondaryNameNode
[root@hadoop10 hadoop]# ./bin/hdfs dfsadmin -report //查看集群是否组建成功
Live datanodes (3): //有三个datanode节点
4、Mapreduce和Yarn的部署
(1)给master(hadoop10)主机添加ResourceManager的角色,给hd01、hd02、hd03添加NodeManager的角色
[root@hadoop10 hadoop]# cd etc/hadoop/
[root@hadoop10 hadoop]# mv mapred-site.xml.template mapred-site.xml
[root@hadoop10 hadoop]# vim mapred-site.xml
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
</configuration>
[root@hadoop10 hadoop]# vim yarn-site.xml
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop10</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
[root@hadoop10 hadoop]# for i in 11 12 13;do rsync -aSH --delete /usr/local/hadoop/ 192.168.8.$i:/usr/local/hadoop/;done
[root@hadoop10 hadoop]# cd /usr/local/hadoop/
[root@hadoop10 hadoop]# ./sbin/start-dfs.sh
[root@hadoop10 hadoop]# ./sbin/start-yarn.sh
[root@hadoop10 hadoop]# jps //查看hadoop10有ResourceManager角色
9299 ResourceManager
9558 Jps
7262 NameNode
7455 SecondaryNameNode
[root@hadoop10 hadoop]# ssh hd01 jps //查看hd01节点有NodeManager(hd02、hd03类似)
2357 NodeManager
2486 Jps
1899 DataNode
(2)web访问hadoop
http://192.168.8.10:50070/ //namenode web页面(hadoop10)
http://192.168.8.10:50090/ //secondary namenode web页面(hadoop10)
http://192.168.8.11:50075/ //datanode web页面(hd01、hd02、hd03)
http://192.168.8.10:8088/ //resourcemanager web页面(hadoop10)
http://192.168.8.11:8042/ //nodemanager web页面(hd01、hd02、hd03)
二、hadoop的简单应用
- HDFS基本命令
#./bin/hadoop fs -ls /
#./bin/hadoop fs -mkdir /abc
#./bin/hadoop fs -rmdir /abc
#./bin/hadoop fs -touchz /abc
#./bin/hadoop fs -cat /abc
#./bin/hadoop fs -rm /abc
#./bin/hadoop fs -put localfile /remotefile //上传文件
#./bin/hadoop fs -get /remotefile //下载文件
- 词频统计
[root@hadoop10 ~]# cd /usr/local/hadoop/
[root@hadoop10 hadoop]# ls
LICENSE.txt README.txt bb etc lib logs share
NOTICE.txt aa bin include libexec sbin
[root@hadoop10 hadoop]# ./bin/hadoop fs -mkdir /abc
[root@hadoop10 hadoop]# ./bin/hadoop fs -ls / //查看创建的/abc目录
[root@hadoop10 hadoop]# ./bin/hadoop fs -put *.txt /abc //上传文件
[root@hadoop10 hadoop]# ./bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.6.jar wordcount /abc /bbb //调用Datanode节点对/abc目录下文件进行词频统计
[root@hadoop10 hadoop]# ./bin/hadoop fs -cat /bbb/* //查看统计结果
[root@hadoop10 hadoop]# ./bin/hadoop fs -get /bbb/* //下载/bbb目录下文件至当前目录
- 增加datanode节点(以增加new节点为例)
(1)准备一台虚拟机new(192.168.8.17)
(2)设置免密登录(/etc/ssh/ssh_config),拷贝NameNode(hadoop10)的公钥至本机
(3)所有节点修改/etc/hosts
192.168.8.10 hadoop10
192.168.8.11 hd01
192.168.8.12 hd02
192.168.8.13 hd03
192.168.8.17 new
(4)安装java运行环境和rsync同步包
(5)rsync拷贝NameNode(hadoop10)的/usr/local/hadoop 到本机
(6)修改NameNode(hadoop10)的slaves文件增加该节点
(7)在new节点启动DataNode
[root@new ~]# cd /usr/local/hadoop/
[root@new hadoop]# ./sbin/hadoop-daemon.sh start datanode
[root@new hadoop]# jps
1923 Jps
1847 DataNode
- 上传文件
[root@hadoop10 hadoop]# cd /usr/local/hadoop/
[root@hadoop10 hadoop]# ./bin/hadoop fs -put /root/elk.tar /elk //上传文件elk.tar(大小为155M)
[root@hadoop10 hadoop]# ./bin/hdfs dfsadmin -report //查看datanode节点数据存储情况
Name: 192.168.8.17:50010 (new)
Hostname: new
Decommission Status : Normal
Configured Capacity: 107362627584 (99.99 GB)
DFS Used: 27893760 (26.60 MB)
Name: 192.168.8.12:50010 (hd02)
Hostname: hd02
Decommission Status : Normal
Configured Capacity: 107362627584 (99.99 GB)
DFS Used: 8192 (8 KB)
Name: 192.168.8.13:50010 (hd03)
Hostname: hd03
Decommission Status : Normal
Configured Capacity: 107362627584 (99.99 GB)
DFS Used: 163164160 (155.61 MB)
Name: 192.168.8.11:50010 (hd01)
Hostname: hd01
Decommission Status : Normal
Configured Capacity: 107362627584 (99.99 GB)
DFS Used: 135278592 (129.01 MB)
可以看到hd01、hd02、hd03和new节点上存放数据的大小分别为129M、8k、156M、26M,总和约为文件elk.tar大小的2倍,说明该上传文件在节点上存储了2份。
- 删除datanode节点(以删除new节点为例)
[root@hadoop10 hadoop]# cd etc/hadoop/
[root@hadoop10 hadoop]# vim hdfs-site.xml
<configuration>
...
<property>
<name>dfs.hosts.exclude</name> //增加dfs.hosts.exclude配置
<value>/usr/local/hadoop/etc/hadoop/exclude</value> //定义要删除的节点值
</property>
...
</configuration>
[root@hadoop10 hadoop]# vim /usr/local/hadoop/etc/hadoop/exclude //定义要删除的节点
new
[root@hadoop10 hadoop]# cd /usr/local/hadoop/
[root@hadoop10 hadoop]# ./bin/hdfs dfsadmin -report
Name: 192.168.8.17:50010 (new)
Hostname: new
Decommission Status : Normal //刷新节点前new节点显示正常状态
[root@hadoop10 hadoop]# ./bin/hdfs dfsadmin -refreshNodes //刷新节点信息
[root@hadoop10 hadoop]# ./bin/hdfs dfsadmin -report
Name: 192.168.8.17:50010 (new)
Hostname: new
Decommission Status : Decommission in progress //刷新节点后new节点显示数据正在迁移(表示需等待数据迁移完成)
[root@hadoop10 hadoop]# ./bin/hdfs dfsadmin -report //一段时间后再次查看new节点状态
Name: 192.168.8.17:50010 (new)
Hostname: new
Decommission Status : Decommissioned //new节点显示数据迁移完成
[root@hadoop10 hadoop]# vim etc/hadoop/slaves //删除定义的new节点
hd01
hd02
hd03
~~new~~
[root@new ~]# poweroff //关闭new节点
- 通过yarn添加datanode节点(以添加new节点为例)
[root@hadoop10 hadoop]# ./sbin/start-yarn.sh
[root@hadoop10 hadoop]# ./bin/yarn node -list
19/04/26 18:57:40 INFO client.RMProxy: Connecting to ResourceManager at hadoop10/192.168.8.10:8032
Total Nodes:3 //显示有三个节点
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
hd02:41712 RUNNING hd02:8042 0
hd03:41337 RUNNING hd03:8042 0
hd01:32779 RUNNING hd01:8042 0
[root@new ~]# cd /usr/local/hadoop/
[root@new hadoop]# ./sbin/yarn-daemon.sh start nodemanager //在new节点上启动nodemanager
[root@new hadoop]# jps //查看new节点的NodeManager是否启动
869 NodeManager
971 Jps
[root@hadoop10 hadoop]# ./bin/yarn node -list
19/04/26 19:02:16 INFO client.RMProxy: Connecting to ResourceManager at hadoop10/192.168.8.10:8032
Total Nodes:4 //显示有四个节点,new节点添加成功
Node-Id Node-State Node-Http-Address Number-of-Running-Containers
hd02:41712 RUNNING hd02:8042 0
hd03:41337 RUNNING hd03:8042 0
new:44146 RUNNING new:8042 0
hd01:32779 RUNNING hd01:8042 0