Hadoop (Part 2): Configuration and Basic Usage

path

$HADOOP_HOME/etc/hadoop

core-site.xml

<configuration>
    <!-- Nameservice of HDFS (the distributed file system): the logical URI of the NameNodes -->
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://cluster1</value>
    </property>


    <!-- Buffer size for sequence files; determines how much data is buffered during read/write operations -->
    <property>
        <name>io.file.buffer.size</name>
        <value>131072</value>
    </property>


    <!-- Hadoop temporary file directory -->
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/data/tmp/hadoop</value>
    </property>


    <!-- Allow proxy connections for user hadoop from any host -->
    <property>
        <name>hadoop.proxyuser.hadoop.hosts</name>
        <value>*</value>
    </property>


    <!-- Allow user hadoop to proxy users from any group -->
    <property>
        <name>hadoop.proxyuser.hadoop.groups</name>
        <value>*</value>
    </property>


    <!-- ZooKeeper quorum addresses and ports -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>dn1:2181,dn2:2181,dn3:2181</value>
    </property>


</configuration>
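
To confirm that a value is actually picked up, query the live client configuration (a quick sanity check; assumes $HADOOP_HOME/bin is on the PATH):

hdfs getconf -confKey fs.defaultFS
# should print: hdfs://cluster1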

hdfs-site.xml

<configuration>
    <!-- HDFS nameservice; must match core-site.xml -->
    <property>
        <name>dfs.nameservices</name>
        <value>cluster1</value>
    </property>


    <!-- NameNodes under cluster1 -->
    <property>
        <name>dfs.ha.namenodes.cluster1</name>
        <value>nna,nns</value>
    </property>


    <!-- RPC addresses -->
    <property>
        <name>dfs.namenode.rpc-address.cluster1.nna</name>
        <value>nna:9000</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.cluster1.nns</name>
        <value>nns:9000</value>
    </property>


    <!-- HTTP addresses -->
    <property>
        <name>dfs.namenode.http-address.cluster1.nna</name>
        <value>nna:50070</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.cluster1.nns</name>
        <value>nns:50070</value>
    </property>


    <!-- Where the NameNode shared edit log is stored on the JournalNodes -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://dn1:8485;dn2:8485;dn3:8485/cluster1</value>
    </property>
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/data/tmp/journal</value>
    </property>


    <!-- Client failover proxy provider for automatic failover -->
    <property>
        <name>dfs.client.failover.proxy.provider.cluster1</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>


    <!-- Fencing method -->
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>
    

    <!-- The sshfence method requires passwordless SSH -->
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/hadoop/.ssh/id_rsa</value>
    </property>


    <!-- Enable automatic failover for HA -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>


    <!-- NameNode namespace (fsimage) storage directory -->
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/data/tmp/dfs/name</value>
    </property>


    <!-- DataNode block storage directory -->
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/data/tmp/dfs/data</value>
    </property>


    <!-- Replication factor -->
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>


    <!-- Enable WebHDFS access to HDFS directories -->
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
    

    <!-- Bind the JournalNode to 0.0.0.0 so it is reachable from both internal and external networks -->
    <property>
        <name>dfs.journalnode.rpc-address</name>
        <value>0.0.0.0:8485</value>
    </property>
    <property>
        <name>dfs.journalnode.http-address</name>
        <value>0.0.0.0:8480</value>
    </property>


    <!-- ZooKeeper quorum used by the ZKFailoverController for automatic failover -->
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>dn1:2181,dn2:2181,dn3:2181</value>
    </property>


</configuration>
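
Once the cluster is running, the HA state of each NameNode can be checked with haadmin (nna and nns are the NameNode IDs defined above); one should report active and the other standby:

hdfs haadmin -getServiceState nna
hdfs haadmin -getServiceState nns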

mapred-site.xml

<configuration>
    <!-- Resource management framework that hosts the compute jobs -->
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>


    <!-- MapReduce JobHistory Server address; default port 10020 -->
    <property>
        <name>mapreduce.jobhistory.address</name>
        <value>0.0.0.0:10020</value>
    </property>


    <!-- MapReduce JobHistory Server web address; default port 19888 -->
    <property>
        <name>mapreduce.jobhistory.webapp.address</name>
        <value>0.0.0.0:19888</value>
    </property>


</configuration>
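
The JobHistory Server configured above is not launched by the start-dfs/start-yarn scripts; on Hadoop 3.x it can be started separately with:

mapred --daemon start historyserver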

yarn-site.xml

<configuration>
    <!-- Retry interval for reconnecting to the RM (ResourceManager) after losing contact -->
    <property>
        <name>yarn.resourcemanager.connect.retry-interval.ms</name>
        <value>2000</value>
    </property>


    <!-- Enable RM HA; defaults to false -->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>


    <!-- RM identifiers -->
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>dn1:2181,dn2:2181,dn3:2181</value>
    </property>


    <!-- Enable automatic failover -->
    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>


    <!-- rm1 configuration start -->
    <!-- Map the rm1 ResourceManager alias to host nna -->
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>nna</value>
    </property>


    <!-- Map the rm2 ResourceManager alias to host nns -->
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>nns</value>
    </property>


    <!-- Set to rm1 on nna and to rm2 on nns -->
    <property>
        <name>yarn.resourcemanager.ha.id</name>
        <value>rm1</value>
    </property>


    <!-- Enable RM recovery -->
    <property>
        <name>yarn.resourcemanager.recovery.enabled</name>
        <value>true</value>
    </property>


    <!-- ZooKeeper connection address for the RM state store -->
    <property>
        <name>yarn.resourcemanager.zk-state-store.address</name>
        <value>dn1:2181,dn2:2181,dn3:2181</value>
    </property>


    <!-- RM state-store implementation for persistence, backed by ZooKeeper -->
    <property>
        <name>yarn.resourcemanager.store.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
    </property>


    <!-- ZooKeeper addresses used for RM state storage and the HA setup -->
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>dn1:2181,dn2:2181,dn3:2181</value>
    </property>


    <!-- Cluster ID -->
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>cluster1-yarn</value>
    </property>


    <!-- How long the ApplicationMaster waits between reconnect attempts after losing the scheduler -->
    <property>
        <name>yarn.app.mapreduce.am.scheduler.connection.wait.interval-ms</name>
        <value>5000</value>
    </property>

    
    <!-- Per-RM host configuration =================================== -->
    <!-- rm1: application access/management address -->
    <property>
        <name>yarn.resourcemanager.address.rm1</name>
        <value>nna:8132</value>
    </property>


    <!-- Scheduler address -->
    <property>
        <name>yarn.resourcemanager.scheduler.address.rm1</name>
        <value>nna:8130</value>
    </property>


    <!-- RM web UI address -->
    <property>
        <name>yarn.resourcemanager.webapp.address.rm1</name>
        <value>nna:8188</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm1</name>
        <value>nna:8131</value>
    </property>
    

    <!-- RM admin addresses -->
    <property>
        <name>yarn.resourcemanager.admin.address.rm1</name>
        <value>nna:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.ha.address.rm1</name>
        <value>nna:23142</value>
    </property>
    <!-- Per-RM host configuration =================================== -->
    <!-- rm1 configuration end -->
    
    
    <!-- rm2 configuration start -->
    <!-- Per-RM host configuration =================================== -->
    <!-- rm2: application access/management address -->
    <property>
        <name>yarn.resourcemanager.address.rm2</name>
        <value>nns:8132</value>
    </property>


    <!-- Scheduler address -->
    <property>
        <name>yarn.resourcemanager.scheduler.address.rm2</name>
        <value>nns:8130</value>
    </property>


    <!-- RM web UI address -->
    <property>
        <name>yarn.resourcemanager.webapp.address.rm2</name>
        <value>nns:8188</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address.rm2</name>
        <value>nns:8131</value>
    </property>
    

    <!-- RM admin addresses -->
    <property>
        <name>yarn.resourcemanager.admin.address.rm2</name>
        <value>nns:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.ha.address.rm2</name>
        <value>nns:23142</value>
    </property>
    <!-- Per-RM host configuration =================================== -->
    <!-- rm2 configuration end -->

    <!-- Auxiliary service of the NM (NodeManager); must be set to mapreduce_shuffle to run MapReduce jobs -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>


    <!-- Shuffle handler class -->
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>


    <!-- NM local file path -->
    <property>
        <name>yarn.nodemanager.local-dirs</name>
        <value>/data/tmp/yarn/local</value>
    </property>


    <!-- NM log directory -->
    <property>
        <name>yarn.nodemanager.log-dirs</name>
        <value>/data/tmp/log/yarn</value>
    </property>


    <!-- Port the ShuffleHandler service runs on, used to serve map output to requesting reducers -->
    <property>
        <name>mapreduce.shuffle.port</name>
        <value>23080</value>
    </property>


    <!-- Client failover handling class -->
    <property>
        <name>yarn.client.failover.proxy.provider</name>
        <value>org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider</value>
    </property>


    <!-- ZooKeeper base path for automatic failover (leader election) -->
    <property>
        <name>yarn.resourcemanager.ha.automatic-failover.zk-base-path</name>
        <value>yarn-leader-election</value>
    </property>


    <!-- Job tracking address; on the nns node change this to http://nns:9001 -->
    <property>
        <name>mapreduce.jobtracker.address</name>
        <value>http://nna:9001</value>
    </property>


    <!-- Enable log aggregation -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>


    <!-- HDFS path where aggregated logs are stored -->
    <property>
        <name>yarn.nodemanager.remote-app-log-dir</name>
        <value>/data/tmp/dfs/logs</value>
    </property>
    <property>
        <name>yarn.nodemanager.remote-app-log-dir-suffix</name>
        <value>logs</value>
    </property>


    <!-- How long aggregated logs are retained on HDFS, in seconds -->
    <property>
        <name>yarn.log-aggregation.retain-seconds</name>
        <value>259200</value>
    </property>


    <!-- Interval between log-deletion checks on HDFS; logs meeting the retention condition are deleted -->
    <property>
        <name>yarn.log-aggregation.retain-check-interval-seconds</name>
        <value>3600</value>
    </property>


    <!-- RM web proxy address and port -->
    <property>
        <name>yarn.web-proxy.address</name>
        <value>nna:8090</value>
    </property>


    <!-- YARN application classpath -->
    <property>
        <description>
            CLASSPATH for YARN applications. A comma-separated list of CLASSPATH entries. When this value is empty, the following default
            CLASSPATH for YARN applications would be used.
            For Linux:
            $HADOOP_CONF_DIR,
            $HADOOP_COMMON_HOME/share/hadoop/common/*,
            $HADOOP_COMMON_HOME/share/hadoop/common/lib/*,
            $HADOOP_HDFS_HOME/share/hadoop/hdfs/*,
            $HADOOP_HDFS_HOME/share/hadoop/hdfs/lib/*,
            $HADOOP_YARN_HOME/share/hadoop/yarn/*,
            $HADOOP_YARN_HOME/share/hadoop/yarn/lib/*
        </description>
        <name>yarn.application.classpath</name>
        <value>
            /data/soft/new/hadoop/share/hadoop/common/*,
            /data/soft/new/hadoop/share/hadoop/common/lib/*,
            /data/soft/new/hadoop/share/hadoop/hdfs/*,
            /data/soft/new/hadoop/share/hadoop/hdfs/lib/*,
            /data/soft/new/hadoop/share/hadoop/yarn/*,
            /data/soft/new/hadoop/share/hadoop/yarn/lib/*
        </value>
    </property>


    <!-- Class implementing the Fair scheduling policy -->
    <property>
        <name>yarn.resourcemanager.scheduler.class</name>
        <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
    </property>


    <!-- Enable the RM system metrics publisher -->
    <property>
        <name>yarn.resourcemanager.system-metrics-publisher.enabled</name>
        <value>true</value>
    </property>


    <!-- Scheduling policy (allocation) file -->
    <property>
        <name>yarn.scheduler.fair.allocation.file</name>
        <value>/data/soft/hadoop-3.2.0/etc/hadoop/fair-scheduler.xml</value>
    </property>


    <!-- Memory (MB) allocated to each NodeManager node -->
    <property>
        <name>yarn.nodemanager.resource.memory-mb</name>
        <value>1024</value>
    </property>


    <!-- Number of CPU vcores allocated to each NodeManager node -->
    <property>
        <name>yarn.nodemanager.resource.cpu-vcores</name>
        <value>1</value>
    </property>


    <!-- Ratio of virtual memory to physical memory -->
    <property>
        <name>yarn.nodemanager.vmem-pmem-ratio</name>
        <value>4.2</value>
    </property>


</configuration>
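
As with HDFS, the RM HA state can be verified once YARN is up (rm1/rm2 are the IDs declared above):

yarn rmadmin -getServiceState rm1
yarn rmadmin -getServiceState rm2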

fair-scheduler.xml

<?xml version="1.0"?>
<allocations>
    <queue name="root">
        <!-- Default queue -->
        <queue name="default">
            <!-- Maximum number of running apps -->
            <maxRunningApps>10</maxRunningApps>
            <!-- Minimum memory and CPU allocation -->
            <minResources>1024 mb, 1 vcores</minResources>
            <!-- Maximum memory and CPU allocation -->
            <maxResources>2048 mb, 2 vcores</maxResources>
            <!-- Scheduling policy -->
            <schedulingPolicy>fair</schedulingPolicy>
            <weight>1.0</weight>
            <aclSubmitApps>hadoop</aclSubmitApps>
            <aclAdministerApps>hadoop</aclAdministerApps>
        </queue>

        <!-- Queue for the hadoop user -->
        <queue name="hadoop">
            <!-- Maximum number of running apps -->
            <maxRunningApps>10</maxRunningApps>
            <!-- Minimum memory and CPU allocation -->
            <minResources>1024 mb, 1 vcores</minResources>
            <!-- Maximum memory and CPU allocation -->
            <maxResources>3072 mb, 3 vcores</maxResources>
            <!-- Scheduling policy -->
            <schedulingPolicy>fair</schedulingPolicy>
            <weight>1.0</weight>
            <aclSubmitApps>hadoop</aclSubmitApps>
            <aclAdministerApps>hadoop</aclAdministerApps>
        </queue>

        <!-- Queue for the queue_1024_01 users -->
        <queue name="queue_1024_01">
            <!-- Maximum number of running apps -->
            <maxRunningApps>10</maxRunningApps>
            <!-- Minimum memory and CPU allocation -->
            <minResources>1000 mb, 1 vcores</minResources>
            <!-- Maximum memory and CPU allocation -->
            <maxResources>2048 mb, 2 vcores</maxResources>
            <!-- Scheduling policy -->
            <schedulingPolicy>fair</schedulingPolicy>
            <weight>1.0</weight>
            <aclSubmitApps>hadoop,user1024</aclSubmitApps>
            <aclAdministerApps>hadoop,user1024</aclAdministerApps>
        </queue>
    </queue>

    <fairSharePreemptionTimeout>600000</fairSharePreemptionTimeout>
    <defaultMinSharePreemptionTimeout>600000</defaultMinSharePreemptionTimeout>
</allocations>
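
To route a job to one of these queues, pass the queue name at submission time. A minimal sketch using the bundled examples jar (the jar path assumes the hadoop-3.2.0 install referenced above; input and output paths are placeholders):

hadoop jar /data/soft/hadoop-3.2.0/share/hadoop/mapreduce/hadoop-mapreduce-examples-3.2.0.jar \
    wordcount -Dmapreduce.job.queuename=root.hadoop /home/hdfs/test /home/hdfs/out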

hadoop-env.sh

export JAVA_HOME=/data/soft/jdk1.8.0_211

yarn-env.sh

export JAVA_HOME=/data/soft/jdk1.8.0_211

workers

Append:

dn1
dn2
dn3
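
The same configuration must be present on every node. A minimal sketch for pushing it out from nna (assumes identical install paths on all hosts and the passwordless SSH already set up for fencing):

for h in nns dn1 dn2 dn3; do
    rsync -av $HADOOP_HOME/etc/hadoop/ $h:$HADOOP_HOME/etc/hadoop/
done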

Start the JournalNodes

Pick one of the two:

  • On any NameNode, start the JournalNode processes on all worker nodes at once:
    hadoop-daemons.sh start journalnode
  • On each DataNode, start a JournalNode process individually:
    hadoop-daemon.sh start journalnode

Format the NameNode

hdfs namenode -format
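
Run this once only, on nna; reformatting generates a new clusterID and strands existing DataNodes. The result can be inspected under the dfs.namenode.name.dir configured above:

cat /data/tmp/dfs/name/current/VERSION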

Register the ZNode with ZooKeeper

hdfs zkfc -formatZK
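
This creates a znode for the cluster1 nameservice under /hadoop-ha, which can be verified from any ZooKeeper node (zkCli.sh ships with ZooKeeper):

zkCli.sh -server dn1:2181 ls /hadoop-ha
# should print: [cluster1]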

Start HDFS

start-dfs.sh

Start YARN

start-yarn.sh
jps: lists the Java processes on the local machine
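
Roughly what jps should report with this layout (illustrative; PIDs omitted, and exact placement depends on which daemons run on each host):

[hadoop@nna ~]$ jps
NameNode
DFSZKFailoverController
ResourceManager

[hadoop@dn1 ~]$ jps
DataNode
JournalNode
NodeManager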

Sync the nna metadata to nns

[hadoop@nns ~]$ hdfs namenode -bootstrapStandby
[hadoop@nns ~]$ hadoop-daemon.sh start namenode
[hadoop@nns ~]$ yarn-daemon.sh start resourcemanager

View cluster info in the browser

HDFS: http://nna:50070/
YARN: http://nna:8188/

Upload
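
If the target directory does not exist yet, create it first:

[hadoop@nna~]$ hdfs dfs -mkdir -p /home/hdfs/test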

[hadoop@nna~]$ hdfs dfs -put helloworld.txt /home/hdfs/test

View

[hadoop@nna~]$ hdfs dfs -cat /home/hdfs/test/helloworld.txt

Download

[hadoop@nna~]$ hdfs dfs -get /home/hdfs/test/helloworld.txt ./

Delete

[hadoop@nna~]$ hdfs dfs -rm -r /home/hdfs/test/helloworld.txt
[hadoop@nna~]$ hdfs dfs -rm -r /home/hdfs/test

Cluster status

[hadoop@nna~]$ hdfs dfsadmin -report
[hadoop@nna~]$ hdfs version
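
For a block-level health check, fsck walks the namespace and reports missing or under-replicated blocks:

[hadoop@nna~]$ hdfs fsck /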
