MapReduce Learning 03: Client-Side Development



The goal of this section is to use a Java client to count how many times each word appears in a target text.
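For example, given an input file with the lines below, the expected output is one line per word with its count (tab-separated, as TextOutputFormat writes it); the sample text is purely illustrative:

Input:
    hello world
    hello hadoop

Expected output:
    hadoop  1
    hello   2
    world   1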

Preparing the POM

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>hadoop</groupId>
    <artifactId>hadooptest</artifactId>
    <version>1.0-SNAPSHOT</version>

  <dependencies>

      <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>2.6.5</version>
      </dependency>

      <dependency>
          <groupId>junit</groupId>
          <artifactId>junit</artifactId>
          <version>4.12</version>
          <scope>compile</scope>
      </dependency>
  </dependencies>


</project>
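A sketch of the assumed project layout: the four *-site.xml files in the next section go under src/main/resources so that new Configuration(true) can load them from the classpath (this layout is an assumption about the Maven project, not something fixed by Hadoop):

hadooptest/
├── pom.xml
└── src/main/
    ├── java/mapreduce/
    │   ├── MyJob.java
    │   └── MapReduceTest2.java
    └── resources/
        ├── core-site.xml
        ├── hdfs-site.xml
        ├── mapred-site.xml
        └── yarn-site.xml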

Preparing the configuration files

core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://myCluster</value>
    </property>
    <property>
        <name>ha.zookeeper.quorum</name>
        <value>dream02:2181,dream03:2181,dream04:2181</value>
    </property>
</configuration>
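Before wiring up the MapReduce job, it can help to confirm that the client resolves the hdfs://myCluster nameservice. A minimal sketch (HdfsSmokeTest is a hypothetical class name, and /data/wc/input is the sample path used later in this post):

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsSmokeTest {
    public static void main(String[] args) throws Exception {
        // true: load the *-site.xml files from the classpath (src/main/resources)
        Configuration conf = new Configuration(true);
        // fs.defaultFS = hdfs://myCluster is resolved through the HA settings in hdfs-site.xml
        FileSystem fs = FileSystem.get(conf);
        System.out.println("Connected to: " + fs.getUri());
        System.out.println("/data/wc/input exists: " + fs.exists(new Path("/data/wc/input")));
        fs.close();
    }
}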

hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/var/bigdata/hadoop/ha/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/var/bigdata/hadoop/ha/dfs/data</value>
    </property>
    <!-- One-to-many mapping from the logical nameservice to the physical NameNodes -->
    <property>
        <name>dfs.nameservices</name>
        <value>myCluster</value>
    </property>
    <property>
        <name>dfs.ha.namenodes.myCluster</name>
        <value>nn1,nn2</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.myCluster.nn1</name>
        <value>dream01:8020</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-address.myCluster.nn2</name>
        <value>dream02:8020</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.myCluster.nn1</name>
        <value>dream01:50070</value>
    </property>
    <property>
        <name>dfs.namenode.http-address.myCluster.nn2</name>
        <value>dream02:50070</value>
    </property>
    <!-- Where the JournalNodes run and which directory stores their data -->
    <property>
        <name>dfs.namenode.shared.edits.dir</name>
        <value>qjournal://dream01:8485;dream02:8485;dream03:8485/myClusterE</value>
    </property>
    <property>
        <name>dfs.journalnode.edits.dir</name>
        <value>/var/bigdata/hadoop/ha/dfs/jn</value>
    </property>
    <!-- Proxy class and fencing method for HA role switching; we use passwordless ssh -->
    <property>
        <name>dfs.client.failover.proxy.provider.myCluster</name>
        <value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
    </property>
    <property>
        <name>dfs.ha.fencing.methods</name>
        <value>sshfence</value>
    </property>
    <property>
        <name>dfs.ha.fencing.ssh.private-key-files</name>
        <value>/home/god/.ssh/id_dsa</value>
    </property>

    <!-- Enable automatic failover (handled by ZKFC) -->
    <property>
        <name>dfs.ha.automatic-failover.enabled</name>
        <value>true</value>
    </property>
</configuration>
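With automatic failover enabled, the current NameNode state can be checked on the cluster with the standard haadmin command (nn1 and nn2 are the ids defined above):

hdfs haadmin -getServiceState nn1    # prints active or standby
hdfs haadmin -getServiceState nn2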

mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>

</configuration>

yarn-site.xml

<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>

<!-- Site specific YARN configuration properties -->
    <!-- Auxiliary shuffle service used by MapReduce -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <!-- Enable ResourceManager HA -->
    <property>
        <name>yarn.resourcemanager.ha.enabled</name>
        <value>true</value>
    </property>
    <property>
        <name>yarn.resourcemanager.zk-address</name>
        <value>dream02:2181,dream03:2181,dream04:2181</value>
    </property>
    <property>
        <name>yarn.resourcemanager.cluster-id</name>
        <value>my_rm</value>
    </property>
    <property>
        <name>yarn.resourcemanager.ha.rm-ids</name>
        <value>rm1,rm2</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm1</name>
        <value>dream03</value>
    </property>
    <property>
        <name>yarn.resourcemanager.hostname.rm2</name>
        <value>dream04</value>
    </property>
</configuration>
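Similarly, ResourceManager HA state can be checked with rmadmin (rm1 and rm2 are the ids configured above):

yarn rmadmin -getServiceState rm1
yarn rmadmin -getServiceState rm2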

Code implementation

Write the custom Mapper and Reducer classes.

package mapreduce;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.StringTokenizer;

public class MyJob {
    // Mapper: for each input record, emit (word, 1) for every whitespace-separated token
    public static class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private final Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer: sum the 1s for each word and emit (word, total count)
    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context) throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
}
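Since JUnit is already in the POM, the whitespace splitting the mapper relies on can be checked locally without a cluster. This sketch only exercises StringTokenizer, not the Mapper class itself (TokenizeTest is a hypothetical test class):

package mapreduce;

import org.junit.Assert;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

public class TokenizeTest {
    @Test
    public void splitsOnWhitespace() {
        // Same splitting logic the mapper uses: StringTokenizer over whitespace
        StringTokenizer itr = new StringTokenizer("hello world  hello\thadoop");
        List<String> words = new ArrayList<>();
        while (itr.hasMoreTokens()) {
            words.add(itr.nextToken());
        }
        Assert.assertEquals(4, words.size());
        Assert.assertEquals("hello", words.get(0));
    }
}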

Next, write the client-side driver code.

package mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class MapReduceTest2 {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration(true);

        /*
           GenericOptionsParser separates Hadoop options from application arguments.
           For example, if args is "-D mapreduce.job.reduces=2 /data/wc/input /data/wc/output4",
           the -D option "mapreduce.job.reduces=2" is set into the Configuration,
           while "/data/wc/input" and "/data/wc/output4" are treated as application arguments,
           which can be retrieved with getRemainingArgs().
         */
        GenericOptionsParser genericOptionsParser = new GenericOptionsParser(configuration, args);
        String[] remainingArgs = genericOptionsParser.getRemainingArgs();

        // Get a Job instance
        Job job = Job.getInstance(configuration);

        // Set the jar by class: the packaged jar will be shipped to the cluster,
        // where this driver class is located via reflection
        job.setJarByClass(MapReduceTest2.class);

        // Set the job name
        job.setJobName("peng_job");

        // Input path of the source file to be processed
        Path infile = new Path(remainingArgs[0]);
        TextInputFormat.addInputPath(job, infile);
        // Output path for the results; delete it first if it already exists
        Path outfile = new Path(remainingArgs[1]);
        if (outfile.getFileSystem(configuration).exists(outfile)) {
            outfile.getFileSystem(configuration).delete(outfile, true);
        }
        TextOutputFormat.setOutputPath(job, outfile);
        // Mapper class for this job
        job.setMapperClass(MyJob.MyMapper.class);
        // When the reducer pulls map output, serialization is involved; the key/value types
        // must be the Writable types that MapReduce expects (they wrap the serialization)
        job.setOutputKeyClass(Text.class);          // Text is the Writable counterpart of String
        job.setOutputValueClass(IntWritable.class); // IntWritable is the Writable counterpart of int
        // Reducer class for this job
        job.setReducerClass(MyJob.MyReducer.class);
        // Submit the job and wait for it to complete
        job.waitForCompletion(true);
    }
}
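Two optional tweaks that are often added to a driver like this, both standard Job methods (shown here as a sketch, not part of the original code):

// Reuse MyReducer as a combiner to pre-aggregate on the map side;
// this is safe for word count because summing is associative and commutative
job.setCombinerClass(MyJob.MyReducer.class);
// Fix the number of reduce tasks in code instead of passing -D mapreduce.job.reduces
job.setNumReduceTasks(2);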

Ways to submit the job

1. Via a jar package:
After development, package the code into a jar, upload it to one of the cluster nodes, and submit it on the command line with hadoop jar ooxx.jar ooxx in out.

2. Cluster submission from the IDE (without hadoop jar):

1) On YARN:
mapreduce.framework.name -> yarn  // the job runs on the cluster
configuration.set("mapreduce.app-submission.cross-platform", "true");
job.setJar("C:\\Users\\Administrator\\IdeaProjects\\msbhadoop\\target\\hadoop-hdfs-1.0-0.1.jar");

2) Local, single-machine self-test:
mapreduce.framework.name -> local
configuration.set("mapreduce.app-submission.cross-platform", "true"); // required on Windows
1. Deploy a Windows build of Hadoop, e.g. C:\usr\hadoop-2.6.5\hadoop-2.6.5
2. Copy hadoop.dll to C:\Windows\System32
3. Set the environment variable HADOOP_HOME=C:\usr\hadoop-2.6.5\hadoop-2.6.5

(See the examples after this list.)
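As an example of each path: with the POM above the packaged jar would be named hadooptest-1.0-SNAPSHOT.jar, so a command-line submission might look like the first line below (paths follow the earlier examples). The second block is a sketch of the extra lines the driver needs for on-YARN submission from the IDE; the jar path is an assumption about what your local build produces.

hadoop jar hadooptest-1.0-SNAPSHOT.jar mapreduce.MapReduceTest2 -D mapreduce.job.reduces=2 /data/wc/input /data/wc/output4

// In main(), before Job.getInstance(configuration):
configuration.set("mapreduce.framework.name", "yarn");                // run on the cluster
configuration.set("mapreduce.app-submission.cross-platform", "true"); // needed when submitting from Windows
// In main(), after Job.getInstance(configuration):
// point setJar at the jar produced by "mvn package" so it is uploaded along with the job
job.setJar("target/hadooptest-1.0-SNAPSHOT.jar");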
