Hadoop 伪分布式安装
# Download the CDH tarball from http://archive.cloudera.com/cdh5/cdh/5/
wget http://archive.cloudera.com/cdh5/cdh/5/hadoop-2.6.0-cdh5.7.0.tar.gz
# Create a dedicated user to own and run hadoop
useradd hadoop
# The steps below (JDK install, moving files under /) require root
su - root
# Install java under /usr/java — CDH looks for the JDK there by default.
# The directory must exist before we can place the symlink in it.
mkdir -p /usr/java
ln -s /jdk1.8.0_212 /usr/java/java8
# Persist JAVA_HOME for all shells
vi /etc/profile
export JAVA_HOME=/usr/java/java8
export PATH=$JAVA_HOME/bin:$PATH
source /etc/profile
# Unpack hadoop and give it a version-independent path via a symlink
tar -zxvf hadoop-2.6.0-cdh5.7.0.tar.gz
mv ./hadoop-2.6.0-cdh5.7.0 /
ln -s /hadoop-2.6.0-cdh5.7.0 /hadoop
# Hand ownership of the installation to the hadoop user
chown -R hadoop:hadoop /hadoop-2.6.0-cdh5.7.0/
chown -R hadoop:hadoop /hadoop
# Persist HADOOP_HOME for all shells
vi /etc/profile
export HADOOP_HOME=/hadoop
export PATH=$HADOOP_HOME/bin:$PATH
source /etc/profile
# Configure hadoop
cd /hadoop/etc/hadoop
# Remove the Windows helper scripts — not needed on Linux
rm -rf *.cmd
# Point hadoop at the JDK explicitly
vi hadoop-env.sh
export JAVA_HOME=/usr/java/java8
hadoop 运行的三种模式
- Local (Standalone) Mode 独立模式(本地运行,利于debug)
- Pseudo-Distributed Mode 伪分布式
- Fully-Distributed Mode 集群
- 配置伪分布式
# core-site.xml: tell clients where the default filesystem (HDFS namenode) is
vi core-site.xml
<configuration>
<property>
<!-- Pseudo-distributed mode: the namenode runs on this host, port 9000 -->
<name>fs.defaultFS</name>
<value>hdfs://localhost:9000</value>
</property>
</configuration>
# hdfs-site.xml: single-node cluster, so keep only one replica per block
vi hdfs-site.xml
<configuration>
<property>
<!-- Default is 3; with a single datanode anything above 1 just logs under-replication -->
<name>dfs.replication</name>
<value>1</value>
</property>
</configuration>
- 配置免密登录
# Verify current state: this should still prompt for a password
ssh localhost
# Do the key setup as the hadoop user (the one that will run the daemons)
su - hadoop
# Generate an RSA key pair non-interactively: empty passphrase (-P ''),
# fixed output path (-f) — no need to press enter repeatedly
ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
# Use the absolute path so this works regardless of the current directory
cd ~/.ssh
# Authorize our own public key; sshd ignores authorized_keys unless it is 600
cat id_rsa.pub >> authorized_keys
chmod 600 authorized_keys
# Verify: this time no password should be requested
ssh localhost
输入yes,将localhost加入到known_hosts列表中,即可实现免密登录
# Format the namenode; success prints "has been successfully formatted."
cd /hadoop
bin/hdfs namenode -format
# Start namenode and datanode; answer "yes" once to the host-key prompt
sbin/start-dfs.sh
# Verify: SecondaryNameNode, NameNode and DataNode should all be running
ps -ef|grep hadoop
# Open the web UI port (cloud hosts also need a security-group rule).
# --permanent only edits the saved config, so apply it to the running
# firewall with --reload rather than restarting the whole service
firewall-cmd --zone=public --add-port=50070/tcp --permanent
firewall-cmd --reload
# Browse to http://<host-ip>:50070 to see the HDFS status page
- 运行HelloWorld
# Create this user's HDFS home directory; -p creates /user on the way,
# replacing the two separate mkdir calls
bin/hdfs dfs -mkdir -p /user/hadoop
# Upload the hadoop config dir as sample input
# (relative paths resolve under /user/<current-user>/)
bin/hdfs dfs -put etc/hadoop input
# Run the bundled example: grep lines matching 'dfs[a-z.]+' from input into output
bin/hadoop jar share/hadoop/mapreduce/hadoop-mapreduce-examples-2.6.0-cdh5.7.0.jar grep input output 'dfs[a-z.]+'
# The job creates output/ containing _SUCCESS (completion marker) and the
# result part files
bin/hdfs dfs -cat output/*
# Shut down HDFS
sbin/stop-dfs.sh