#安装环境
CentOS 6.8
#JDK
1.8
#Hadoop
2.5.0
http://hadoop.apache.org/docs/r2.5.2/hadoop-project-dist/hadoop-common/SingleCluster.html
#hadoop 单节点配置
edit the file etc/hadoop/hadoop-env.sh
# set to the root of your Java installation
export JAVA_HOME=/usr/java/latest
#修改mapred-env.sh
#设置JAVA_HOME
export JAVA_HOME=/data/modules/jdk
#修改yarn-env.sh
export JAVA_HOME=/data/modules/jdk
配置好JDK之后,我们来看下Hadoop的版本号
[root@hadoop-master bin]# ./hadoop version
Hadoop 2.5.0
Subversion http://svn.apache.org/repos/asf/hadoop/common -r 1616291
Compiled by jenkins on 2014-08-06T17:31Z
Compiled with protoc 2.5.0
From source with checksum 423dcd5a752eddd8e45ead6fd5ff9a24
This command was run using /data/modules/hadoop-2.5.0/share/hadoop/common/hadoop-common-2.5.0.jar
#我们接着按官网的教程走
#编辑etc/hadoop/core-site.xml
#加入如下几个节点
#将localhost转换成自己的主机名
#并配置临时文件夹
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/data/modules/hadoop-2.5.0/hadoop-data/tmp</value>
</property>
</configuration>
#etc/hadoop/hdfs-site.xml:
<configuration>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
#配置完毕之后,我们来格式化一下namenode
[root@hadoop-master hadoop-2.5.0]# bin/hdfs namenode -format
报错
java.net.UnknownHostException: hadoop-master: hadoop-master: Name or service not known
#我们需要编辑/etc/hosts增加主机名和IP地址的映射
192.168.1.200 hadoop-master hadoop-master
#然后再次执行格式化namenode
#然后就成功了
#接下来我们来启动namenode
[root@hadoop-master hadoop-2.5.0]# sbin/hadoop-daemon.sh start namenode
starting namenode, logging to /data/modules/hadoop-2.5.0/logs/hadoop-root-namenode-hadoop-master.out
[root@hadoop-master hadoop-2.5.0]# jps
6075 Jps
6012 NameNode
[root@hadoop-master hadoop-2.5.0]# sbin/hadoop-daemon.sh start datanode
starting datanode, logging to /data/modules/hadoop-2.5.0/logs/hadoop-root-datanode-hadoop-master.out
[root@hadoop-master hadoop-2.5.0]# jps
6096 DataNode
6167 Jps
6012 NameNode
#启动成功后,通过宿主机访问50070,发现暂时不能访问
#发现防火墙没有关闭
#我们来关闭防火墙
[root@hadoop-master hadoop-2.5.0]# service iptables status
[root@hadoop-master hadoop-2.5.0]# service iptables stop
iptables: Setting chains to policy ACCEPT: filter [ OK ]
iptables: Flushing firewall rules: [ OK ]
iptables: Unloading modules: [ OK ]
[root@hadoop-master hadoop-2.5.0]# chkconfig iptables off
[root@hadoop-master hadoop-2.5.0]# service iptables status
iptables: Firewall is not running.
#关闭之后,我们就可以正常访问50070端口了
#然后我们在根目录下创建一个文件夹
[root@hadoop-master hadoop-2.5.0]# bin/hdfs dfs -mkdir /demo
18/05/15 00:48:55 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
#这里会报一个Native的错误,原因是本地没有Native的包,可以通过替换Native包来消灭这个警告
#放一个文件到hdfs上
[root@hadoop-master hadoop-2.5.0]# bin/hdfs dfs -put ../../example/1.txt /demo/
#查看1.txt文件内容
[root@hadoop-master hadoop-2.5.0]# bin/hdfs dfs -text /demo/1.txt
##################
#配置yarn
##################
把mapred-site.xml.template重命名为mapred-site.xml
加入
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
#编辑yarn-site.xml
加入
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadoop-master</value>
</property>
#配置好后,我们启动resourcemanager
[root@hadoop-master hadoop-2.5.0]# sbin/yarn-daemon.sh start resourcemanager
#然后再开启nodemanager
[root@hadoop-master hadoop-2.5.0]# sbin/yarn-daemon.sh start nodemanager
#然后访问8088端口
http://192.168.1.200:8088/cluster
#运行一个官方示例
[root@hadoop-master hadoop-2.5.0]# bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount /demo/1.txt /output
#######################
#配置historyserver
#######################
#修改mapred-site.xml
#加入如下节点
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop-master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop-master:19888</value>
</property>
#然后启动历史记录访问服务(historyserver)
[root@hadoop-master hadoop-2.5.0]# sbin/mr-jobhistory-daemon.sh start historyserver
#访问http://192.168.1.200:19888端口可以看到日志聚合功能开启的结果
#将如下配置加入到yarn-site.xml 开启日志功能
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>106800</value>
</property>
#然后重启hadoop服务
#[root@hadoop-master hadoop-2.5.0]# killall java
#重新启动流程
[root@hadoop-master hadoop-2.5.0]# sbin/hadoop-daemon.sh start namenode
starting namenode, logging to /data/modules/hadoop-2.5.0/logs/hadoop-root-namenode-hadoop-master.out
[root@hadoop-master hadoop-2.5.0]# sbin/hadoop-daemon.sh start datanode
starting datanode, logging to /data/modules/hadoop-2.5.0/logs/hadoop-root-datanode-hadoop-master.out
[root@hadoop-master hadoop-2.5.0]# sbin/yarn-daemon.sh start resourcemanager
starting resourcemanager, logging to /data/modules/hadoop-2.5.0/logs/yarn-root-resourcemanager-hadoop-master.out
[root@hadoop-master hadoop-2.5.0]# sbin/yarn-daemon.sh start nodemanager
starting nodemanager, logging to /data/modules/hadoop-2.5.0/logs/yarn-root-nodemanager-hadoop-master.out
[root@hadoop-master hadoop-2.5.0]# sbin/mr-jobhistory-daemon.sh start historyserver
starting historyserver, logging to /data/modules/hadoop-2.5.0/logs/mapred-root-historyserver-hadoop-master.out
[root@hadoop-master hadoop-2.5.0]# jps
17632 Jps
17442 NodeManager
17196 ResourceManager
17599 JobHistoryServer
17103 DataNode
17023 NameNode
#######################
#SSH配置免密钥
#######################
#确保你当前所在的目录是家目录
cd ~
输入如下命令
[root@hadoop-master ~]# ssh-keygen -t rsa
#然后回车3次即可
#生成的文件发送给自己即可
[root@hadoop-master ~]# ssh-copy-id hadoop-master
#使用ssh命令进行验证
[root@hadoop-master ~]# ssh hadoop-master
#接下来我们使用
start-dfs.sh
#来验证刚才配置的免密钥登录是否已经成功
#######################
#搭建windows hadoop开发环境
#######################
#blog地址
https://blog.csdn.net/lsr40/article/details/77868113
#解压hadoop 2.5.0
#配置hadoop home
D:\hadoop\hadoop-2.5.0
#在POM中加入
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.5.0</version>
</dependency>
#将
core-site.xml
hdfs-site.xml
log4j.properties
#拷入resources文件夹下
#启动程序后,会报如下异常
Permission denied: user=gy, access=WRITE, inode="/":root:supergroup:drwxr-xr-x
#在hdfs-site.xml中加入如下配置
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
#配置完毕之后,重启hdfs
#将本地的程序用maven打包
#一开始出现如下的错误
[root@hadoop-master hadoop-2.5.0]# bin/yarn jar /data/jar/hadoop-1.0.0.jar
RunJar jarFile [mainClass] args...
#打包的时候没有指定主类
#在POM中加入
<build>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<mainClass>org.lynn.MR</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>assembly</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
#然后重新打包
[root@hadoop-master hadoop-2.5.0]# bin/yarn jar /data/jar/hadoop-1.0.3.jar
程序可以运行
#combiner 优化
#combiner 是单个map的reduce
#reduce汇总多个map的输出,是必不可少的阶段
#####################
###完全分布式搭建####
#####################
-------部署规划-------
hadoop-master | hadooop | hadoooop
namenode |resourcemanager |secondarynamenode
datanode |datanode |datanode
nodemanager |nodemanager |nodemanager
historyserver| |
#三台机器以root用户在/data/modules/app目录进行安装
#所有操作都以hadoop-master为样例
#解压hadoop的压缩包到当前路径
[root@hadoop-master app]# tar -zxvf /data/softwares/hadoop-2.5.0.tar.gz -C .
#查看当前主机是否安装了Java
[root@hadoop-master app]# echo $JAVA_HOME
/data/modules/jdk
#编辑hadoop-env.sh
#替换JAVA_HOME
export JAVA_HOME=${JAVA_HOME}
为
export JAVA_HOME=/data/modules/jdk
#替换mapred-env.sh
# export JAVA_HOME=/home/y/libexec/jdk1.6.0/
为
export JAVA_HOME=/data/modules/jdk
#替换yarn-env.sh
if [ "$JAVA_HOME" != "" ]; then
#echo "run java in $JAVA_HOME"
JAVA_HOME=$JAVA_HOME
fi
为
JAVA_HOME=/data/modules/jdk
#修改core-site.xml
<property>
<name>fs.defaultFS</name>
<value>hdfs://hadoop-master:8020</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/data/modules/app/hadoop-2.5.0/hadoop-data/tmp</value>
</property>
#修改hdfs-site.xml
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>hadoooop:50090</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
#修改slaves
hadoop-master
hadooop
hadoooop
#修改mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop-master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop-master:19888</value>
</property>
#修改yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadooop</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>106800</value>
</property>
#设置时间服务器
[root@hadoop-master tmp]# service ntpd status
#在三台机器上同时开启时间服务器
[root@hadoop-master tmp]# service ntpd start
Starting ntpd: [ OK ]
#设置开机自动启动
[root@hadoooop app]# chkconfig ntpd on
#修改 /etc/ntp.conf
17 # Hosts on local network are less restricted.
18 #restrict 192.168.1.0 mask 255.255.255.0 nomodify notrap
修改为自己的网段
#在内网使用时,不需要外网的服务器,注释掉22-25行
20 # Use public servers from the pool.ntp.org project.
21 # Please consider joining the pool (http://www.pool.ntp.org/join.html).
22 #server 0.centos.pool.ntp.org iburst
23 #server 1.centos.pool.ntp.org iburst
24 #server 2.centos.pool.ntp.org iburst
25 #server 3.centos.pool.ntp.org iburst
#添加本地时间同步
34 #if time server not available, use local time
35 server 127.127.1.0 # local clock
36 fudge 127.127.1.0 stratum 10
#保存之后,重新启动服务
[root@hadoop-master tmp]# service ntpd restart
Shutting down ntpd: [ OK ]
Starting ntpd: [ OK ]
#查看本机是否安装了ntp
[root@hadoop-master ~]# rpm -qa | grep ntp
ntp-4.2.6p5-10.el6.centos.x86_64
fontpackages-filesystem-1.41-1.1.el6.noarch
ntpdate-4.2.6p5-10.el6.centos.x86_64
#编辑/etc/hosts
192.168.1.200 hadoop-master
192.168.1.201 hadooop
192.168.1.202 hadoooop
#ntpdate用来设置本地日期和时间
[root@hadoop-master ~]# ntpdate hadoop-master
2 Jun 11:41:35 ntpdate[83590]: the NTP socket is in use, exiting
#停止ntpd
[root@hadoop-master ~]# service ntpd stop
Shutting down ntpd: [ OK ]
#编写时间同步脚本
#下面的需要在同步的节点上编辑
crontab -e
##sync time
0-59/10 * * * * /usr/sbin/ntpdate hadoop-master
#然后将3台机器的时间同步,可以利用xshell的群发功能来做这个操作
date -s "2018-6-2 11:51"
#格式化namenode
[root@hadoop-master hadoop-2.5.0]# bin/hdfs namenode -format
#将配置分发给远程节点
[root@hadoop-master app]# scp -r hadoop-2.5.0/ root@hadooop:/data/modules/app
[root@hadoop-master app]# scp -r hadoop-2.5.0/ root@hadoooop:/data/modules/app
#配置免密钥登录 在home目录中
[root@hadooop ~]# rm -rf .ssh/
[root@hadoooop ~]# rm -rf .ssh/
#三台机器生成密钥 在xshell全部会话
[root@hadoop-master ~]# ssh-keygen -t rsa
#将密钥发送给别人 包括自己
#第一台
[root@hadoop-master ~]# ssh-copy-id hadoop-master
[root@hadoop-master ~]# ssh-copy-id hadooop
[root@hadoop-master ~]# ssh-copy-id hadoooop
#第二台
[root@hadoop-master ~]# ssh-copy-id hadooop
#第三台
[root@hadoop-master ~]# ssh-copy-id hadoooop
#接下来 验证一下 发现已经不需要输入免密钥的验证
#如果免密钥配置正确,则不会要求再次输入密码
#然后发现,另外的两台机器没有安装jdk ,同样我们来安装一下
[root@hadoop-master hadoop-2.5.0]# sbin/start-dfs.sh
#如果一切正常 我们现在第一台机器上查看 jps
#第一台机器
[root@hadoop-master hadoop-2.5.0]# jps
3957 DataNode
3865 NameNode
4169 Jps
#第二台机器
[root@hadooop modules]# jps
28920 Jps
28842 DataNode
#第三台机器
[root@hadoooop modules]# jps
28904 SecondaryNameNode
28988 Jps
28846 DataNode
#启动yarn
[root@hadoop-master hadoop-2.5.0]# sbin/start-yarn.sh
#启动之后,jps我们发现,yarn并没有启动,因为在我们的架构中,resourcemanager存在第二台机器上
#所以我们需要在第二台机器上启动
[root@hadooop hadoop-2.5.0]# sbin/start-yarn.sh
#好了,之后我们可以启动webUI来查启动的live nodes
#这个时候我们在集群中的任意一个节点发送文件,都可以在集群中看到
#然后我们在集群中跑一个任务开始测试
[root@hadoooop hadoop-2.5.0]# bin/yarn jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.5.0.jar wordcount /cluster-demo/1.txt /cluster-demo/result
#同时停止所有节点
[root@hadoop-master hadoop-2.5.0]# sbin/stop-all.sh
#resourcemanager需要在开启的机器上的单独停止
[root@hadooop sbin]# ./stop-yarn.sh
#####################
###完全分布式之HA####
#####################
#配置zookeeper
#修改zoo.cfg
dataDir=/data/modules/zookeeper-3.4.6/zkData
#这样zk的standalone的模式就配置好了
[root@hadoop-master zookeeper-3.4.6]# bin/zkServer.sh start
JMX enabled by default
Using config: /data/modules/zookeeper-3.4.6/bin/../conf/zoo.cfg
Starting zookeeper ... STARTED
[root@hadoop-master zookeeper-3.4.6]# bin/zkServer.sh status
JMX enabled by default
Using config: /data/modules/zookeeper-3.4.6/bin/../conf/zoo.cfg
Mode: standalone
#配置zookeeper的完全分布式
server.1=hadoop-master:2888:3888
server.2=hadooop:2888:3888
server.3=hadoooop:2888:3888
#在dataDir下创建myid文件 myid中的数字跟上面机器上对应的节点要一样
#修改HA的hadoop配置
#hdfs-site.xml
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.permissions.enabled</name>
<value>false</value>
</property>
<!-- HA -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>hadoop-master:8020</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>hadoooop:8020</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>hadoop-master:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>hadoooop:50070</value>
</property>
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://hadoop-master:8485;hadooop:8485;hadoooop:8485/mycluster</value>
</property>
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<property>
<name>dfs.ha.fencing.methods</name>
<value>shell(/bin/true)</value>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/data/modules/app/hadoop-2.5.0/hadoop-data/journal</value>
</property>
#core-site.xml
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/data/modules/app/hadoop-2.5.0/hadoop-data/tmp</value>
</property>
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>hadoop-master:2181,hadooop:2181,hadoooop:2181</value>
</property>
#mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>hadoop-master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>hadoop-master:19888</value>
</property>
#yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>hadooop</value>
</property>
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>106800</value>
</property>
#HA的启动方式
#要先启动zookeeper
./zkServer.sh start
#然后启动jounalnode
[root@hadoop-master sbin]# ./hadoop-daemon.sh start journalnode
#然后格式化第一台namenode,然后第二台去同步
[root@hadoop-master hadoop-2.5.0]# bin/hdfs namenode -format
#启动刚才格式化好的namenode
[root@hadoop-master hadoop-2.5.0]# sbin/hadoop-daemon.sh start namenode
#来到第二台namenode,同步刚才的namenode信息
[root@hadoooop hadoop-2.5.0]# bin/hdfs namenode -bootstrapStandby
#启动第二台机器的namenode
[root@hadoooop hadoop-2.5.0]# sbin/hadoop-daemon.sh start namenode
#这样启动两台namenode之后,两台namenode都是standby状态,需要手动激活
#首先查看两台机器的状态
[root@hadoop-master hadoop-2.5.0]# bin/hdfs haadmin -getServiceState nn1
18/06/04 22:31:49 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
standby
[root@hadoop-master hadoop-2.5.0]# bin/hdfs haadmin -getServiceState nn2
18/06/04 22:32:01 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
standby
#发现两台机器都是standby状态,我们手动激活一下
[root@hadoop-master hadoop-2.5.0]# bin/hdfs haadmin -transitionToActive nn1
#如果成功的话,我们的手动故障转移已经配置成功了,接下来,我们来配置自动故障转移
#首先停止全部节点
#然后在hdfs-site.xml中加入如下配置
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
#然后需要format zkfc
[root@hadoop-master hadoop-2.5.0]# bin/hdfs zkfc -formatZK
#然后重新开启Hadoop
[root@hadoop-master hadoop-2.5.0]# sbin/start-dfs.sh
#启动之后,发现第一台机器的namenode无法正常启动
#尝试在hdfs-site.xml中加入如下配置
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
#我们先来停止全部服务
[root@hadoop-master hadoop-2.5.0]# sbin/stop-all.sh
#先使用zkCli.sh删掉hadoop-ha节点
rmr /hadoop-ha
#再次重新格式化zkfc
[root@hadoop-master hadoop-2.5.0]# bin/hdfs zkfc -formatZK
#再次重新启动Hadoop
[root@hadoop-master hadoop-2.5.0]# sbin/start-dfs.sh
#接下来来做HA自动故障转移的测试
#杀掉第一台机器的namenode
#此时,如果第二台namenode能够自动切换为active,则说明Hadoop的HA配置成功
#小结
#启动hadoop和yarn的两条命令
sbin/start-dfs.sh
sbin/start-yarn.sh
#namenode的访问地址
192.168.1.200:50070
#yarn服务访问地址
http://192.168.1.201:8088
#jobhistory的访问地址
http://192.168.1.200:19888/jobhistory