(1) 添加HADOOP命令以及JDK到环境变量中
~]# vim /etc/profile
export JAVA_HOME=/app/jdk1.8.0_60
export PATH=$JAVA_HOME/bin:$PATH
~]# vim /etc/profile.d/hadoop.sh
export HADOOP_HOME=/app/hadoop-2.7.2 //根据实际安装位置修改
export PATH=$HADOOP_HOME/bin:$HADOOP_HOME/sbin:$PATH
(2) 修改hadoop中的core-site.xml以方便使用hdfs
<property>
<name>fs.defaultFS</name>
<value>hdfs://node3:8020</value> //hdfs安装节点URI
</property>
(3) 修改hadoop中的hadoop-env.sh文件中的JAVA_HOME;
JAVA_HOME=/app/jdk1.8.0_60 //根据实际JDK安装情况修改
(4) 启动Namenode以及Datanode并格式化Namenode
~]# hdfs namenode -format //格式化namenode,切记,只有第一次需要格式化
~]# hadoop-daemon.sh start namenode //启动namenode
~]# jps //查看是否有"namenode",如果有则说明启动成功
~]# hadoop-daemon.sh start datanode //启动datanode
(5) 配置伪分布式集群模式
~]# vim $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
(6) 跑测试MapReduce程序
~]# mkdir $HADOOP_HOME/input
~]# cp $HADOOP_HOME/etc/hadoop/*.xml $HADOOP_HOME/input
~]# hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.2.jar grep input output 'hdf[a-z]+'
// input是我们创建的目录,指定mapreduce从哪里读数据,grep是筛选的意思,output是输出目录,不存在的话会自动创建
~]# ls $HADOOP_HOME/output/part* // 会查看到处理后的数据(结果文件在output目录下)
(7) 在HDFS上跑MapReduce程序
~]# hdfs dfs -mkdir -p /home/fang/input
~]# vim test.txt // 编写测试文件
fang
Google
learn
Hadoop
Mapreduce
fang
~]# hdfs dfs -put test.txt /home/fang/input // 将本地创建的test.txt传送到hdfs上的指定目录下
~]# hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.2.jar wordcount /home/fang/input /home/fang/output
// wordcount是统计文件中的单词个数,从hdfs上的"/home/fang/input"中读取文件,处理后传到hdfs的"/home/fang/output"
http://node3:50070 --> "Utilities" --> Browse the file system 可查看到hdfs上存储的数据(会看到在指定目录下生成的output文件夹)
(8) 在Yarn上跑MapReduce程序
~]# vim $HADOOP_HOME/etc/hadoop/yarn-env.sh
JAVA_HOME=/app/jdk1.8.0_60
~]# vim $HADOOP_HOME/etc/hadoop/mapred-env.sh
JAVA_HOME=/app/jdk1.8.0_60
~]# cp $HADOOP_HOME/etc/hadoop/mapred-site.xml.template $HADOOP_HOME/etc/hadoop/mapred-site.xml
~]# vim mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
~]# vim $HADOOP_HOME/etc/hadoop/yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>node3</value>
</property>
~]# yarn-daemon.sh start resourcemanager
~]# yarn-daemon.sh start nodemanager
http://node3:8088 --> application 可查看到MapReduce跑的任务
(9) 修改本地文件临时存储目录
~]# mkdir -p $HADOOP_HOME/data/tmp
~]# vim $HADOOP_HOME/etc/hadoop/core-site.xml
<property>
<name>hadoop.tmp.dir</name>
<value>/app/hadoop-2.7.2/data/tmp</value>
</property>
~]# yarn-daemon.sh stop nodemanager
~]# yarn-daemon.sh stop resourcemanager
~]# hadoop-daemon.sh stop datanode
~]# hadoop-daemon.sh stop namenode
~]# rm -rf /tmp/hadoop-root //删除之前生成的临时目录
~]# hdfs namenode -format // 重新格式化namenode
// 再全部启动
(10) Hadoop相关配置文件(core-site.xml)
~]# cd $HADOOP_HOME/share/hadoop/common/
~]# unzip hadoop-common-2.7.2.jar -d /app
~]# vim /app/core-site.xml // 此文件中是配置文件的模板,可修改完后将内容复制到$HADOOP_HOME/etc/hadoop/core-site.xml
(11) Hadoop相关配置文件(hdfs-default.xml)
~]# cd $HADOOP_HOME/share/hadoop/hdfs
~]# unzip hadoop-hdfs-2.7.2.jar -d /app
(12) Hadoop相关配置文件(yarn-default.xml)
~]# cd $HADOOP_HOME/share/hadoop/yarn
~]# unzip hadoop-yarn-common-2.7.2.jar -d /app
(13) Hadoop相关配置文件(mapred-default.xml)
~]# cd $HADOOP_HOME/share/hadoop/mapreduce
~]# unzip hadoop-mapreduce-client-core-2.7.2.jar -d /app
Hadoop分布式集群的配置
一、步骤
(1) 配置免密
(2) 编写xsync同步文件脚本
(3) 编写xcall可在所有节点执行命令脚本
(4) 集群的部署规划
二、编写xsync脚本
~]# vim /usr/bin/xsync
#!/bin/bash
# xsync — push a file or directory to every cluster node (node1..node3)
# via rsync, preserving its absolute path on the remote side.
# Usage: xsync <path>

# Require at least one argument.
pcount=$#
if (( pcount == 0 )); then
  echo "Please input arg!" >&2
  exit 2
fi

# Base name of the file/dir to sync.
# BUG FIX: original ran `basename $fname` before $fname was ever set;
# it must derive the name from the first argument $f1.
f1=$1
fname=$(basename -- "$f1")
echo "fname=$fname"

# Absolute (symlink-resolved) parent directory of the argument.
# BUG FIX: original was `cd -P $(dirname $f1)` with no `pwd`, so the
# command substitution captured an empty string and $pdir was always empty.
pdir=$(cd -P -- "$(dirname -- "$f1")" && pwd)
echo "pdir=$pdir"

# Current user — the remote login account.
user=$(whoami)

# Loop over the nodes; adjust the range/hostnames to match your cluster.
for ((host=1; host<=3; host++)); do
  echo "--------------- hadoop$host ----------------"
  rsync -rvl "$pdir/$fname" "$user@node$host:$pdir"
done
~]# chmod +x /usr/bin/xsync
~]# xsync /app/te
fname=te
pdir=/app
/app/te root@node1:/app
==================slave1==================
sending incremental file list
sent 25 bytes received 12 bytes 24.67 bytes/sec
total size is 0 speedup is 0.00
/app/te root@node2:/app
==================slave2==================
sending incremental file list
te
sent 64 bytes received 31 bytes 190.00 bytes/sec
total size is 0 speedup is 0.00
/app/te root@node3:/app
==================slave3==================
sending incremental file list
te
sent 64 bytes received 31 bytes 190.00 bytes/sec
total size is 0 speedup is 0.00
三、编写xcall脚本
~]# vim /usr/bin/xcall
#!/bin/bash
# xcall — run the given command locally and then on every cluster node
# (node1..node3) over ssh.
# Usage: xcall <command> [args...]

# Require at least one argument.
pcount=$#
if (( pcount == 0 )); then
  echo "Please input arg!" >&2
  exit 2
fi

# Current user — the remote login account.
user=$(whoami)

echo -------------localhost----------
# FIX: quote "$@" so arguments containing spaces survive as separate words.
"$@"

# Adjust the range/hostnames to match your cluster.
for ((host=1; host<=3; host++)); do
  echo ----------node$host---------
  ssh "$user@node$host" "$@"
done
~]# chmod +x /usr/bin/xcall
~]# xcall ls /app // 即会打印node{1..3}中的文件
四、集群的部署规划
node1 DataNode、Nodemanager
node2 DataNode、Nodemanager、SecondaryNameNode
node3 HDFS、Namenode、DataNode、ResourceManager
五、配置集群
操作Node3
~]# vim $HADOOP_HOME/etc/hadoop/core-site.xml
<property>
<name>fs.defaultFS</name>
<value>hdfs://node3:8020</value> //hdfs安装节点URI
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/app/hadoop-2.7.2/data/tmp</value>
</property>
</configuration>
~]# vim $HADOOP_HOME/etc/hadoop/hadoop-env.sh
export JAVA_HOME=/app/jdk1.8.0_60
~]# vim $HADOOP_HOME/etc/hadoop/hdfs-site.xml
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>node2:50090</value>
</property>
~]# vim $HADOOP_HOME/etc/hadoop/slaves // DataNode所在节点
node1
node2
node3
~]# vim $HADOOP_HOME/etc/hadoop/yarn-env.sh
JAVA_HOME=/app/jdk1.8.0_60/
~]# vim $HADOOP_HOME/etc/hadoop/yarn-site.xml
<property>
<name>yarn.resourcemanager.hostname</name>
<value>node3</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
~]# vim $HADOOP_HOME/etc/hadoop/mapred-env.sh
export JAVA_HOME=/app/jdk1.8.0_60/
~]# vim $HADOOP_HOME/etc/hadoop/mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
六、在集群上分发所有配置文件
~]# xsync $HADOOP_HOME/etc/hadoop
七、启动集群环境
~]# hdfs namenode -format
~]# start-dfs.sh
~]# start-yarn.sh // 如果yarn与namenode不在同一台机器上就不能启动yarn,应该在Resourcemanager所在的机器上启动yarn
八、上传文件
~]# touch /app/test.txt
~]# hdfs dfs -put /app/test.txt /home/fang
九、下载文件
~]# hdfs dfs -get /home/fang/test /root
十、Hadoop启动停止方式
~]# hadoop-daemon.sh start|stop namenode|datanode|secondarynamenode // 逐一启动
~]# yarn-daemon.sh start|stop resourcemanager|nodemanager //逐一启动
~]# start-dfs.sh // 整体启动hdfs
~]# stop-dfs.sh // 整体停止hdfs
~]# start-yarn.sh // 整体启动yarn
~]# stop-yarn.sh // 整体停止yarn