hadoop(二):配置与简单使用
path
$HADOOP_HOME/etc/hadoop
core-site.xml
<configuration>
<!-- HDFS(分布式文件储存系统)的NameService,是NameNode的URL -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://cluster1</value>
</property>
<!-- 用于序列文件缓冲区的大小,决定读写操作期间缓存了多少数据 -->
<property>
<name>io.file.buffer.size</name>
<value>131072</value>
</property>
<!-- hadoop临时文件目录 -->
<property>
<name>hadoop.tmp.dir</name>
<value>/data/tmp/hadoop</value>
</property>
<!-- 指定可以在任何ip访问 -->
<property>
<name>hadoop.proxyuser.hadoop.hosts</name>
<value>*</value>
</property>
<!-- 指定所有账号可以访问 -->
<property>
<name>hadoop.proxyuser.hadoop.groups</name>
<value>*</value>
</property>
<!-- 指定zookeeper地址及端口 -->
<property>
<name>ha.zookeeper.quorum</name>
<value>dn1:2181,dn2:2181,dn3:2181</value>
</property>
<!-- text
<property>
<name></name>
<value></value>
</property>
-->
</configuration>
hdfs-site.xml
<configuration>
<!-- 指定HDFS的NameService,与core-site.xml保持一致 -->
<property>
<name>dfs.nameservices</name>
<value>cluster1</value>
</property>
<!-- 设置cluster1下面的NameNode -->
<property>
<name>dfs.ha.namenodes.cluster1</name>
<value>nna,nns</value>
</property>
<!-- RPC通讯地址 -->
<property>
<name>dfs.namenode.rpc-address.cluster1.nna</name>
<value>nna:9000</value>
</property>
<property>
<name>dfs.namenode.rpc-address.cluster1.nns</name>
<value>nns:9000</value>
</property>
<!-- 指定HTTP通讯地址 -->
<property>
<name>dfs.namenode.http-address.cluster1.nna</name>
<value>nna:50070</value>
</property>
<property>
<name>dfs.namenode.http-address.cluster1.nns</name>
<value>nns:50070</value>
</property>
<!-- 指定NameNode的元数据在JournalNode上的存放位置 -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>
qjournal://dn1:8485;dn2:8485;dn3:8485/cluster1
</value>
</property>
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/data/tmp/journal</value>
</property>
<!-- 配置失败自动切换实现方式 -->
<property>
<name>dfs.client.failover.proxy.provider.cluster1</name>
<value>
org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider
</value>
</property>
<!-- 配置隔离机制 -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- 使用隔离机制时需要ssh免密 -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/home/hadoop/.ssh/id_rsa</value>
</property>
<!-- 指定支持高可用自动切换机制 -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- NameNode名称空间存储地址 -->
<property>
<name>dfs.namenode.name.dir</name>
<value>/data/tmp/dfs/name</value>
</property>
<!-- DataNode数据存储地址 -->
<property>
<name>dfs.datanode.data.dir</name>
<value>/data/tmp/dfs/data</value>
</property>
<!-- 数据冗余份数(副本数) -->
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<!-- 是否可通过web访问的HDFS目录 -->
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<!-- 保证数据恢复,通过0.0.0.0保证,则可以兼顾内外网访问 -->
<property>
<name>dfs.journalnode.rpc-address</name>
<value>0.0.0.0:8485</value>
</property>
<property>
<name>dfs.journalnode.http-address</name>
<value>0.0.0.0:8480</value>
</property>
<!-- 通过ZKFailoverController来实现自动故障切换 -->
<property>
<name>ha.zookeeper.quorum</name>
<value>dn1:2181,dn2:2181,dn3:2181</value>
</property>
<!-- text
<property>
<name></name>
<value></value>
</property>
-->
</configuration>
mapred-site.xml
<configuration>
<!-- 计算任务托管的资源管理名称 -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<!-- 配置MapReduce JobHistory Server 地址,默认端口10020 -->
<property>
<name>mapreduce.jobhistory.address</name>
<value>0.0.0.0:10020</value>
</property>
<!-- 配置MapReduce JobHistory Server Web 地址,默认端口19888 -->
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>0.0.0.0:19888</value>
</property>
<!-- text
<property>
<name></name>
<value></value>
</property>
-->
</configuration>
yarn-site.xml
<configuration>
<!-- RM(Resource Manager)失联后重新连接的时间 -->
<property>
<name>yarn.resourcemanager.connect.retry-interval.ms</name>
<value>2000</value>
</property>
<!-- 开启RM HA,默认为false -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- 配置RM -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<property>
<name>ha.zookeeper.quorum</name>
<value>dn1:2181,dn2:2181,dn3:2181</value>
</property>
<!-- 开启故障自动切换 -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- rm1配置开始 -->
<!-- 配置Resource Manager 主机别名rm1 角色为ResourceManager Active -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>nna</value>
</property>
<!-- 配置Resource Manager 主机别名rm2 角色为ResourceManager Standby -->
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>nns</value>
</property>
<!-- nna配置rm1,nns配置rm2 -->
<property>
<name>yarn.resourcemanager.ha.id</name>
<value>rm1</value>
</property>
<!-- 开启自动恢复功能 -->
<property>
<name>yarn.resourcemanager.recovery.enabled</name>
<value>true</value>
</property>
<!-- 配置与zookeeper的连接地址 -->
<property>
<name>yarn.resourcemanager.zk-state-store.address</name>
<value>dn1:2181,dn2:2181,dn3:2181</value>
</property>
<!-- 用于持久化RM实现,基于zookeeper实现 -->
<property>
<name>yarn.resourcemanager.store.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
</property>
<!-- zookeeper地址用于RM实现状态存储,以及HA的设置 -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>dn1:2181,dn2:2181,dn3:2181</value>
</property>
<!-- 集群ID标识 -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>cluster1-yarn</value>
</property>
<!-- schelduler失联等待连接时间 -->
<property>
<name>yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms</name>
<value>5000</value>
</property>
<!-- 不同的RM主机配置=================================== -->
<!-- 配置rm1,其应用访问管理接口 -->
<property>
<name>yarn.resourcemanager.address.rm1</name>
<value>nna:8132</value>
</property>
<!-- 调度接口地址 -->
<property>
<name>yarn.resourcemanager.scheduler.address.rm1</name>
<value>nna:8130</value>
</property>
<!-- RM的Web访问地址 -->
<property>
<name>yarn.resourcemanager.webapp.address.rm1</name>
<value>nna:8188</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm1</name>
<value>nna:8131</value>
</property>
<!-- RM管理员接口地址 -->
<property>
<name>yarn.resourcemanager.admin.address.rm1</name>
<value>nna:8033</value>
</property>
<property>
<name>yarn.resourcemanager.admin.ha.address.rm1</name>
<value>nna:23142</value>
</property>
<!-- 不同的RM主机配置=================================== -->
<!-- rm1配置结束 -->
<!-- rm2配置开始 -->
<!-- 不同的RM主机配置=================================== -->
<!-- 配置rm2,其应用访问管理接口 -->
<property>
<name>yarn.resourcemanager.address.rm2</name>
<value>nns:8132</value>
</property>
<!-- 调度接口地址 -->
<property>
<name>yarn.resourcemanager.scheduler.address.rm2</name>
<value>nns:8130</value>
</property>
<!-- RM的Web访问地址 -->
<property>
<name>yarn.resourcemanager.webapp.address.rm2</name>
<value>nns:8188</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address.rm2</name>
<value>nns:8131</value>
</property>
<!-- RM管理员接口地址 -->
<property>
<name>yarn.resourcemanager.admin.address.rm2</name>
<value>nns:8033</value>
</property>
<property>
<name>yarn.resourcemanager.admin.ha.address.rm2</name>
<value>nns:23142</value>
</property>
<!-- 不同的RM主机配置=================================== -->
<!-- rm2配置结束 -->
<!-- NM(NodeManager)的附属服务,需要设置成mapreduce_shuffle才能运行MapReduce任务 -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<!-- 配置shuffle处理类 -->
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<!-- NM本地文件路径 -->
<property>
<name>yarn.nodemanager.local-dirs</name>
<value>/data/tmp/yarn/local</value>
</property>
<!-- NM日志存放地址 -->
<property>
<name>yarn.nodemanager.log-dirs</name>
<value>/data/tmp/log/yarn</value>
</property>
<!-- ShuffleHandler 运行服务端口,用于 Map 结果输出到请求 Reducer -->
<property>
<name>mapreduce.shuffle.port</name>
<value>23080</value>
</property>
<!-- 故障处理类 -->
<property>
<name>yarn.client.failover.proxy.provider</name>
<value>org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider</value>
</property>
<!-- 故障自动转移的zookeeper路径地址 -->
<property>
<name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
<value>yarn-leader-election</value>
</property>
<!-- 查看任务调度进度,在nns节点上需要将访问地址修改为http://nns:9001 -->
<property>
<name>mapreduce.jobtracker.address</name>
<value>http://nna:9001</value>
</property>
<!-- 启动聚合操作日志 -->
<property>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<!-- 指定日志在HDFS上的路径 -->
<property>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/data/tmp/dfs/logs</value>
</property>
<property>
<name>yarn.nodemanager.remote-app-log-dir-suffix</name>
<value>logs</value>
</property>
<!-- 聚合后的日志在HDFS上保存多长时间,单位为秒 -->
<property>
<name>yarn.log-aggregation.retain-seconds</name>
<value>259200</value>
</property>
<!-- 检查聚合日志是否过期的时间间隔,执行时将满足条件的日志删除 -->
<property>
<name>yarn.log-aggregation.retain-check-interval-seconds</name>
<value>3600</value>
</property>
<!-- RM浏览器代理端口 -->
<property>
<name>yarn.web-proxy.address</name>
<value>nna:8090</value>
</property>
<!-- 配置Fair调度策略 -->
<property>
<description>
CLASSPATH for YARN applications. A comma-separated list of CLASSPATH entries. When this value is empty, the following default
CLASSPATH for YARN applications would be used.
For Linux:
$HADOOP_CONF_DIR,
$HADOOP_COMMON_HOME/share/hadoop/common/*,
$HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
$HADOOP_HDFS_HOME/share/hadoop/hdfs/*,
$HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
$HADOOP_YARN_HOME/share/hadoop/yarn/*,
$HADOOP_YARN_HOME/share/hadoop/yarn/lib/*
</description>
<name>yarn.application.classpath</name>
<value>
/data/soft/new/hadoop/share/hadoop/common/*,
/data/soft/new/hadoop/share/hadoop/common/lib/*,
/data/soft/new/hadoop/share/hadoop/hdfs/*,
/data/soft/new/hadoop/share/hadoop/hdfs/lib/*,
/data/soft/new/hadoop/share/hadoop/yarn/*,
/data/soft/new/hadoop/share/hadoop/yarn/lib/*
</value>
</property>
<!-- 配置 Fair 调度策略指定类 -->
<property>
<name>yarn.resourcemanager.scheduler.class</name>
<value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
</property>
<!-- 启动RM系统监控 -->
<property>
<name>yarn.resourcemanager.system-metrics-publisher.enabled</name>
<value>true</value>
</property>
<!-- 指定调度策略配置文件 -->
<property>
<name>yarn.scheduler.fair.allocation.file</name>
<value>/data/soft/hadoop-3.2.0/etc/hadoop/fair-scheduler.xml</value>
</property>
<!-- 每个NodeManager节点分配的内存大小 -->
<property>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>1024</value>
</property>
<!-- 每个NodeManager节点分配的cpu核数 -->
<property>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>1</value>
</property>
<!-- 物理内存和虚拟内存比率 -->
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>4.2</value>
</property>
<!-- text
<property>
<name></name>
<value></value>
</property>
-->
</configuration>
fair-scheduler.xml
<?xml version="1.0"?>
<allocations>
<queue name="root">
<!-- 默认队列 -->
<queue name="default">
<!-- 允许最大App运行数 -->
<maxRunningApps>10</maxRunningApps>
<!-- 分配最小内存和CPU -->
<minResources>1024mb,1vcores</minResources>
<!-- 分配最大内存和CPU -->
<maxResources>2048mb,2vcores</maxResources>
<!-- 调度策略 -->
<schedulingPolicy>fair</schedulingPolicy>
<weight>1.0</weight>
<aclSubmitApps>hadoop</aclSubmitApps>
<aclAdministerApps>hadoop</aclAdministerApps>
</queue>
<!-- 配置hadoop用户队列 -->
<queue name="hadoop">
<!-- 允许最大App运行数 -->
<maxRunningApps>10</maxRunningApps>
<!-- 分配最小内存和CPU -->
<minResources>1024mb,1vcores</minResources>
<!-- 分配最大内存和CPU -->
<maxResources>3072mb,3vcores</maxResources>
<!-- 调度策略 -->
<schedulingPolicy>fair</schedulingPolicy>
<weight>1.0</weight>
<aclSubmitApps>hadoop</aclSubmitApps>
<aclAdministerApps>hadoop</aclAdministerApps>
</queue>
<!-- 配置queue_1024_01 用户队列 -->
<queue name="queue_1024_01">
<!-- 允许最大App运行数 -->
<maxRunningApps>10</maxRunningApps>
<!-- 分配最小内存和CPU -->
<minResources>1000mb,1vcores</minResources>
<!-- 分配最大内存和CPU -->
<maxResources>2048mb,2vcores</maxResources>
<!-- 调度策略 -->
<schedulingPolicy>fair</schedulingPolicy>
<weight>1.0</weight>
<aclSubmitApps>hadoop,user1024</aclSubmitApps>
<aclAdministerApps>hadoop,user1024</aclAdministerApps>
</queue>
</queue>
<fairSharePreemptionTimeout>600000</fairSharePreemptionTimeout>
<defaultMinSharePreemptionTimeout>600000</defaultMinSharePreemptionTimeout>
</allocations>
hadoop-env.sh
export JAVA_HOME=/data/soft/jdk1.8.0_211
yarn-env.sh
export JAVA_HOME=/data/soft/jdk1.8.0_211
workers
追加
dn1
dn2
dn3
启动JournalNode
2选1
- 任意NameNode上启动JournalNode进程
hadoop-daemons.sh start journalnode
- 在每个DataNode分别启动JournalNode进程
hadoop-daemon.sh start journalnode
格式化NameNode
hdfs namenode -format
向zookeeper注册ZNode
hdfs zkfc -formatZK
启动分布式文件系统
start-dfs.sh
启动YARN进程
start-yarn.sh
jps
: 列出本机java进程
nns同步nna元数据
[hadoop@nns ~]$ hdfs namenode -bootstrapStandby
[hadoop@nns ~]$ hadoop-daemon.sh start namenode
[hadoop@nns ~]$ yarn-daemon.sh start resourcemanager
网址查看集群信息
hadoop:http://nna:50070/
yarn:http://nna:8188/
上传
[hadoop@nna~]$ hdfs dfs -put helloworld.txt /home/hdfs/test
查看
[hadoop@nna~]$ hdfs dfs -cat /home/hdfs/test/helloworld.txt
下载
[hadoop@nna~]$ hdfs dfs -get /home/hdfs/test/helloworld.txt ./
删除
[hadoop@nna~]$ hdfs dfs -rm -r /home/hdfs/test/helloworld.txt
[hadoop@nna~]$ hdfs dfs -rm -r /home/hdfs/test
集群运行状态
[hadoop@nna~]$ hdfs dfsadmin -report
[hadoop@nna~]$ hdfs version