Hadoop伪集群部署
Hadoop是一个由Apache基金会所开发的分布式系统基础架构。用户可以在不了解分布式底层细节的情况下,开发分布式程序。充分利用集群的威力进行高速运算和存储。Hadoop实现了一个分布式文件系统( Distributed File System),其中一个组件是HDFS(Hadoop Distributed File System)。HDFS有高容错性的特点,并且设计用来部署在低廉的(low-cost)硬件上;而且它提供高吞吐量(high throughput)来访问应用程序的数据,适合那些有着超大数据集(large data set)的应用程序。HDFS放宽了(relax)POSIX的要求,可以以流的形式访问(streaming access)文件系统中的数据。Hadoop的框架最核心的设计就是:HDFS和MapReduce。HDFS为海量的数据提供了存储,而MapReduce则为海量的数据提供了计算 。
1.创建两个文件docker-compose.yml和hadoop.env
# 单datanode Docker-compose.yml内容如下:
version: "3.3"

services:
  namenode:
    image: bde2020/hadoop-namenode:1.1.0-hadoop2.7.1-java8
    container_name: namenode
    hostname: namenode
    volumes:
      - hadoop_namenode:/hadoop/dfs/name
    environment:
      - CLUSTER_NAME=test
    env_file:
      - ./hadoop.env
    ports:
      - "50070:50070"  # NameNode web UI
      - "8020:8020"    # fs.defaultFS RPC port (matches hadoop.env)

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:1.1.0-hadoop2.7.1-java8
    container_name: resourcemanager
    hostname: resourcemanager
    depends_on:
      - namenode
      - datanode1
    env_file:
      - ./hadoop.env
    ports:
      - "8088:8088"  # YARN ResourceManager web UI

  historyserver:
    image: bde2020/hadoop-historyserver:1.1.0-hadoop2.7.1-java8
    container_name: historyserver
    hostname: historyserver
    depends_on:
      - namenode
      - datanode1
    volumes:
      - hadoop_historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env
    ports:
      - "8188:8188"  # timeline / application history web UI

  nodemanager1:
    image: bde2020/hadoop-nodemanager:1.1.0-hadoop2.7.1-java8
    container_name: nodemanager1
    hostname: nodemanager1
    depends_on:
      - namenode
      - datanode1
    env_file:
      - ./hadoop.env
    ports:
      - "8042:8042"  # NodeManager web UI

  datanode1:
    image: bde2020/hadoop-datanode:1.1.0-hadoop2.7.1-java8
    container_name: datanode1
    hostname: datanode1
    ports:
      - "50075:50075"  # DataNode web UI
    depends_on:
      - namenode
    volumes:
      - hadoop_datanode1:/hadoop/dfs/data
    env_file:
      - ./hadoop.env

# Named volumes keep HDFS metadata/data across container recreation.
volumes:
  hadoop_namenode:
  hadoop_datanode1:
  hadoop_historyserver:
# 多datanode docker-compose.yml如下:
version: "3.3"

services:
  namenode:
    image: bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8
    # Fixed hostname so other daemons can reach the NameNode by name
    # (requires matching entries in the host's /etc/hosts under host networking).
    hostname: namenode
    container_name: namenode
    # NOTE(review): with network_mode "host" Docker ignores published ports — the
    # daemon binds directly on the host. Mappings kept as documentation of the
    # ports each service listens on.
    ports:
      - "9000:9000"    # fs.defaultFS RPC (Hadoop 3.x default)
      - "50070:50070"  # NameNode web UI (9870 in stock Hadoop 3.2.1 — see note at end of doc)
    restart: always
    # Host networking: the container shares the host network namespace. Used here
    # to integrate with a Spark cluster and to reach randomly assigned ports.
    network_mode: 'host'
    volumes:
      - ./hadoop/namenode:/hadoop/dfs/name
      - ./hadoop/input_files:/input_files
    # These could equally be placed in hadoop.env.
    environment:
      - CLUSTER_NAME=test
      # Disable HDFS permission checks so access is not limited to the hadoop user.
      - HDFS_CONF_dfs_permissions=false
    env_file:
      - ./hadoop.env

  resourcemanager:
    image: bde2020/hadoop-resourcemanager:2.0.0-hadoop3.2.1-java8
    hostname: resourcemanager
    container_name: resourcemanager
    ports:
      - "8030:8030"  # scheduler
      - "8031:8031"  # resource tracker
      - "8032:8032"  # RM RPC
      - "8033:8033"  # admin
      - "8088:8088"  # web UI
    restart: always
    network_mode: 'host'
    depends_on:
      - namenode
      - datanode1
      - datanode2
    env_file:
      - ./hadoop.env

  historyserver:
    image: bde2020/hadoop-historyserver:2.0.0-hadoop3.2.1-java8
    hostname: historyserver
    container_name: historyserver
    ports:
      - "8188:8188"  # timeline / application history web UI
    restart: always
    network_mode: 'host'
    depends_on:
      - namenode
      - datanode1
      - datanode2
    volumes:
      - ./hadoop/historyserver:/hadoop/yarn/timeline
    env_file:
      - ./hadoop.env

  nodemanager1:
    image: bde2020/hadoop-nodemanager:2.0.0-hadoop3.2.1-java8
    hostname: nodemanager1
    container_name: nodemanager1
    ports:
      - "8040:8040"  # localizer
      - "8041:8041"  # NM RPC
      - "8042:8042"  # web UI
    restart: always
    network_mode: 'host'
    depends_on:
      - namenode
      - datanode1
      - datanode2
    env_file:
      - ./hadoop.env

  datanode1:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    hostname: datanode1
    container_name: datanode1
    restart: always
    network_mode: 'host'
    environment:
      # Equivalent to setting dfs.datanode.address in hdfs-site.xml.
      - HDFS_CONF_dfs_datanode_address=0.0.0.0:50010
      # Non-default ports allow several DataNodes on one machine: each daemon on
      # the shared host network needs a distinct set of ports.
      - HDFS_CONF_dfs_datanode_ipc_address=0.0.0.0:50020
      # dfs.datanode.http.address
      - HDFS_CONF_dfs_datanode_http_address=0.0.0.0:50075
    ports:
      - "50010:50010"
      - "50020:50020"
      - "50075:50075"
    depends_on:
      - namenode
    volumes:
      - ./hadoop/datanode1:/hadoop/dfs/data
    env_file:
      - ./hadoop.env

  datanode2:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    hostname: datanode2
    container_name: datanode2
    restart: always
    network_mode: 'host'
    environment:
      - HDFS_CONF_dfs_datanode_address=0.0.0.0:50012
      - HDFS_CONF_dfs_datanode_ipc_address=0.0.0.0:50022
      - HDFS_CONF_dfs_datanode_http_address=0.0.0.0:50072
    ports:
      - "50012:50012"
      - "50022:50022"
      - "50072:50072"
    depends_on:
      - namenode
    volumes:
      - ./hadoop/datanode2:/hadoop/dfs/data
    env_file:
      - ./hadoop.env

  datanode3:
    image: bde2020/hadoop-datanode:2.0.0-hadoop3.2.1-java8
    hostname: datanode3
    container_name: datanode3
    restart: always
    network_mode: 'host'
    environment:
      - HDFS_CONF_dfs_datanode_address=0.0.0.0:50013
      - HDFS_CONF_dfs_datanode_ipc_address=0.0.0.0:50023
      - HDFS_CONF_dfs_datanode_http_address=0.0.0.0:50073
    ports:
      - "50013:50013"
      - "50023:50023"
      - "50073:50073"
    depends_on:
      - namenode
    volumes:
      - ./hadoop/datanode3:/hadoop/dfs/data
    env_file:
      - ./hadoop.env
# hadoop.env环境配置如下:
# hadoop.env — shared Hadoop configuration injected into every service via env_file.
# Naming convention (presumably handled by the bde2020 image entrypoint — verify):
#   <FILE>_CONF_<property> where '.' -> '_' and '-' -> '___' (triple underscore).

# core-site.xml
# NOTE(review): the multi-datanode compose file exposes 9000 for the NameNode RPC,
# and Hadoop 3.x images commonly default fs.defaultFS to 9000 — confirm 8020 is
# correct for the image version in use.
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*

# hdfs-site.xml
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false

# yarn-site.xml ('___' encodes a hyphen, e.g. log___aggregation -> log-aggregation)
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
# 多datanode节点
# 添加host属性时,network_mode: 'host',更改系统/etc/host文件
# /etc/hosts entries for network_mode: host — map each container hostname to the
# local machine so daemons and browsers can resolve e.g. http://namenode:50070.
# NOTE(review): 0.0.0.0 is an unusual address for a hosts entry; 127.0.0.1 (or the
# host's LAN IP for remote access) is the conventional choice — confirm name
# resolution actually works on your OS before relying on this.
0.0.0.0 namenode
0.0.0.0 resourcemanager
0.0.0.0 historyserver
0.0.0.0 nodemanager1
0.0.0.0 datanode1
0.0.0.0 datanode2
0.0.0.0 datanode3
# Linux环境Jdk配置
# Edit /etc/profile to add the JDK to the environment.
# (The original used `cd /etc/profile`, which fails: /etc/profile is a file,
# not a directory — open it in an editor instead.)
vi /etc/profile
# Java executables live in $JAVA_HOME/bin, not in $JAVA_HOME itself.
export PATH=$JAVA_HOME/bin:/usr/local/bin:$PATH
# Reload the environment configuration in the current shell
source /etc/profile
# 访问
http://namenode:50070/dfshealth.html#tab-overview
http://historyserver:8188/applicationhistory
http://resourcemanager:8088/cluster/apps/RUNNING
http://nodemanager1:8042/node
http://datanode1:50075/
http://datanode2:50072/
http://datanode3:50073/
问题
# 1、Failed to Setup IP tables: Unable to enable SKIP DNAT rule: (iptables failed: iptables --wait -t nat -I DOCKER -i br-5ebdd9daf7e6 -j RETURN: iptables: No chain/target/match by that name.
网上查找发现,可能是网络问题造成
首先先验证docker容器内部网络是否能ping通宿主机
如果能ping通,即可通过重建docker0网络恢复
先停掉宿主机上运行的docker容器,然后执行以下命令
在宿主机执行:
# Run on the host, after stopping all running containers.
pkill docker
# Flush Docker's NAT rules so they are rebuilt from scratch.
iptables -t nat -F
# Tear down and delete the docker0 bridge (brctl requires the bridge-utils package).
ifconfig docker0 down
brctl delbr docker0
# `docker -d` is obsolete (the daemon flag was removed in modern Docker);
# restarting the service recreates docker0 and the iptables chains.
systemctl restart docker  # restart the Docker service
问题即可解决。
# 2、Hadoop 3.2.1版本 namenode 50070端口更改为9870