1、顺序组合式
顺序组合式就是按照指定顺序执行任务如:mapreduce1 --> mapreduce2 --> mapreduce3
即:mapreduce1的输出是mapreduce2的输入,mapreduce2的输出是mapreduce3的输入
代码片段如下:
- // Sequential composition: job1 -> job2 -> job3, each job reading the output
- // directory of the previous one. `conf` is a Configuration created by the
- // surrounding code, which is not part of this fragment.
- String inPath1 = "hdfs://hadoop0:9000/user/root/3D/";
- String outPath1 = "hdfs://hadoop0:9000/user/root/3DZout/";
- String outPath2 = "hdfs://hadoop0:9000/user/root/3DZout2/";
- String outPath3 = "hdfs://hadoop0:9000/user/root/3DZout3/";
- // job1 configuration: reads the original input, writes outPath1
- Job job1 = Job.getInstance(conf);
- job1.setJarByClass(Mode.class);
- job1.setMapperClass(Map1.class);
- job1.setReducerClass(Reduce1.class);
- job1.setMapOutputKeyClass(Text.class);
- job1.setMapOutputValueClass(IntWritable.class);
- job1.setOutputKeyClass(Text.class);
- job1.setOutputValueClass(IntWritable.class);
- FileInputFormat.addInputPath(job1, new Path(inPath1));
- FileOutputFormat.setOutputPath(job1, new Path(outPath1));
- // waitForCompletion blocks until the job ends and returns false on failure;
- // stop here so job2 never starts on missing or partial output.
- if (!job1.waitForCompletion(true)) {
- System.exit(1);
- }
- // job2 configuration: reads job1's output (outPath1), writes outPath2
- Job job2 = Job.getInstance(conf);
- job2.setJarByClass(Mode.class);
- job2.setMapperClass(Map2.class);
- job2.setReducerClass(Reduce2.class);
- job2.setMapOutputKeyClass(Text.class);
- job2.setMapOutputValueClass(IntWritable.class);
- job2.setOutputKeyClass(Text.class);
- job2.setOutputValueClass(IntWritable.class);
- FileInputFormat.addInputPath(job2, new Path(outPath1));
- FileOutputFormat.setOutputPath(job2, new Path(outPath2));
- if (!job2.waitForCompletion(true)) {
- System.exit(1);
- }
- // job3 configuration: reads job2's output (outPath2), writes outPath3
- Job job3 = Job.getInstance(conf);
- job3.setJarByClass(Mode.class);
- job3.setMapperClass(Map3.class);
- job3.setReducerClass(Reduce3.class);
- job3.setMapOutputKeyClass(Text.class);
- job3.setMapOutputValueClass(IntWritable.class);
- job3.setOutputKeyClass(Text.class);
- job3.setOutputValueClass(IntWritable.class);
- FileInputFormat.addInputPath(job3, new Path(outPath2));
- FileOutputFormat.setOutputPath(job3, new Path(outPath3));
- if (!job3.waitForCompletion(true)) {
- System.exit(1);
- }
子任务作业配置代码运行后,将按照顺序逐个执行每个子任务作业。由于后一个子任务需要使用前一个子任务的输出数据,因此,每一个子任务
都需要等前一个子任务执行完毕后才允许执行,这是通过job.waitForCompletion(true)方法加以保证的。
2、迭代组合式
迭代也可以理解为for循环或while循环,当满足某些条件时,循环结束
mapreduce的迭代算法正在研究中,后续提供完整源码....
代码如下:(暂缺,待后续补充)
3、复杂的依赖组合式
处理复杂的要求的时候,有时候一个mapreduce程序完成不了,往往需要多个mapreduce程序 这个时候就牵扯到各个任务之间的依赖关系,
所谓依赖就是一个M/R job的处理结果是另外一个M/R的输入,以此类推,
这里的顺序是 job1 和 job2 单独执行, job3依赖job1和job2执行后的结果
代码如下:
- package com.hadoop.mapreduce;
- import java.io.IOException;
- import java.util.StringTokenizer;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
- import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- /**
- * Dependency-based composition of three word-count style MapReduce jobs using
- * JobControl: job1 and job2 have no dependencies, job3 is declared to depend
- * on both and reads the output directories of both.
- */
- public class Mode {
- // First job
- /** Emits (token, 1) for every whitespace-separated token of the input line. */
- public static class Map1 extends Mapper<Object, Text, Text, IntWritable>{
- // Reused for every record, like `word`, instead of allocating one IntWritable per token.
- private static final IntWritable ONE = new IntWritable(1);
- Text word = new Text();
- @Override
- protected void map(Object key, Text value,Context context)
- throws IOException, InterruptedException {
- StringTokenizer st = new StringTokenizer(value.toString());
- while(st.hasMoreTokens()){
- word.set(st.nextToken());
- context.write(word, ONE);
- }
- }
- }
- /** Sums all counts received for a key and emits (key, sum). */
- public static class Reduce1 extends Reducer<Text, IntWritable, Text, IntWritable>{
- IntWritable result = new IntWritable();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values,Context context)
- throws IOException, InterruptedException {
- int sum = 0;
- for(IntWritable val : values){
- sum += val.get();
- }
- result.set(sum);
- context.write(key, result);
- }
- }
- // Second job
- /** Emits (token, 1) for every whitespace-separated token of the input line. */
- public static class Map2 extends Mapper<Object, Text, Text, IntWritable>{
- private static final IntWritable ONE = new IntWritable(1);
- Text word = new Text();
- @Override
- protected void map(Object key, Text value,Context context)
- throws IOException, InterruptedException {
- StringTokenizer st = new StringTokenizer(value.toString());
- while(st.hasMoreTokens()){
- word.set(st.nextToken());
- context.write(word, ONE);
- }
- }
- }
- /** Sums all counts received for a key and emits (key, sum). */
- public static class Reduce2 extends Reducer<Text, IntWritable, Text, IntWritable>{
- IntWritable result = new IntWritable();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values,Context context)
- throws IOException, InterruptedException {
- int sum = 0;
- for(IntWritable val : values){
- sum += val.get();
- }
- result.set(sum);
- context.write(key, result);
- }
- }
- // Third job
- /** Emits (token, 1) for every whitespace-separated token of the input line. */
- public static class Map3 extends Mapper<Object, Text, Text, IntWritable>{
- private static final IntWritable ONE = new IntWritable(1);
- Text word = new Text();
- @Override
- protected void map(Object key, Text value,Context context)
- throws IOException, InterruptedException {
- StringTokenizer st = new StringTokenizer(value.toString());
- while(st.hasMoreTokens()){
- word.set(st.nextToken());
- context.write(word, ONE);
- }
- }
- }
- /** Sums all counts received for a key and emits (key, sum). */
- public static class Reduce3 extends Reducer<Text, IntWritable, Text, IntWritable>{
- IntWritable result = new IntWritable();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values,Context context)
- throws IOException, InterruptedException {
- int sum = 0;
- for(IntWritable val : values){
- sum += val.get();
- }
- result.set(sum);
- context.write(key, result);
- }
- }
- /**
- * Recursively deletes {@code dir} if it exists. The file system is resolved
- * from the path itself, so a fully-qualified hdfs:// URI works even when the
- * default file system of {@code conf} is a different one.
- */
- private static void deleteIfExists(Configuration conf, String dir) throws IOException {
- Path path = new Path(dir);
- FileSystem fs = path.getFileSystem(conf);
- if (fs.exists(path)) {
- fs.delete(path, true);
- }
- }
- /**
- * Configures the three jobs, registers them with a JobControl, runs the
- * controller on its own thread and waits until every job has finished.
- * Exits with status 1 when any job failed.
- *
- * @param args command-line arguments (not used; all paths are hard-coded)
- * @throws IOException if the file system or job setup fails
- */
- public static void main(String[] args) throws IOException{
- String inPath1 = "hdfs://hadoop0:9000/user/root/3D/";
- String outPath1 = "hdfs://hadoop0:9000/user/root/3DZout/";
- String outPath2 = "hdfs://hadoop0:9000/user/root/3DZout2/";
- String outPath3 = "hdfs://hadoop0:9000/user/root/3DZout3/";
- String[] inOut = {inPath1, outPath1};
- Configuration conf = new Configuration();
- String[] otherArgs = new GenericOptionsParser(conf, inOut).getRemainingArgs();
- if (otherArgs.length < 2) {
- System.err.println("Usage: wordcount <in> [<in>...] <out>");
- System.exit(2);
- }
- // Remove output directories left over from a previous run
- deleteIfExists(conf, outPath1);
- deleteIfExists(conf, outPath2);
- deleteIfExists(conf, outPath3);
- // job1 configuration
- Job job1 = Job.getInstance(conf);
- job1.setJarByClass(Mode.class);
- job1.setMapperClass(Map1.class);
- job1.setReducerClass(Reduce1.class);
- job1.setMapOutputKeyClass(Text.class);
- job1.setMapOutputValueClass(IntWritable.class);
- job1.setOutputKeyClass(Text.class);
- job1.setOutputValueClass(IntWritable.class);
- FileInputFormat.addInputPath(job1, new Path(inPath1));
- FileOutputFormat.setOutputPath(job1, new Path(outPath1));
- // Wrap job1 so the controller can manage it
- ControlledJob ctrljob1 = new ControlledJob(conf);
- ctrljob1.setJob(job1);
- // job2 configuration
- Job job2 = Job.getInstance(conf);
- job2.setJarByClass(Mode.class);
- job2.setMapperClass(Map2.class);
- job2.setReducerClass(Reduce2.class);
- job2.setMapOutputKeyClass(Text.class);
- job2.setMapOutputValueClass(IntWritable.class);
- job2.setOutputKeyClass(Text.class);
- job2.setOutputValueClass(IntWritable.class);
- FileInputFormat.addInputPath(job2, new Path(inPath1));
- FileOutputFormat.setOutputPath(job2, new Path(outPath2));
- // Wrap job2 so the controller can manage it
- ControlledJob ctrljob2 = new ControlledJob(conf);
- ctrljob2.setJob(job2);
- // job3 configuration
- Job job3 = Job.getInstance(conf);
- job3.setJarByClass(Mode.class);
- job3.setMapperClass(Map3.class);
- job3.setReducerClass(Reduce3.class);
- job3.setMapOutputKeyClass(Text.class);
- job3.setMapOutputValueClass(IntWritable.class);
- job3.setOutputKeyClass(Text.class);
- job3.setOutputValueClass(IntWritable.class);
- // job3 depends on job1 and job2, so it consumes the output of both
- FileInputFormat.addInputPath(job3, new Path(outPath1));
- FileInputFormat.addInputPath(job3, new Path(outPath2));
- FileOutputFormat.setOutputPath(job3, new Path(outPath3));
- ControlledJob ctrljob3 = new ControlledJob(conf);
- // job3 must not start before job1 and job2 have completed
- ctrljob3.addDependingJob(ctrljob1);
- ctrljob3.addDependingJob(ctrljob2);
- ctrljob3.setJob(job3);
- // Main controller
- JobControl jobCtrl = new JobControl("myctrl");
- jobCtrl.addJob(ctrljob1);
- jobCtrl.addJob(ctrljob2);
- jobCtrl.addJob(ctrljob3);
- // The controller only schedules jobs while its run() loop is executing,
- // so it has to be started on its own thread.
- Thread t = new Thread(jobCtrl);
- t.start();
- boolean anyFailed = false;
- try {
- // Poll with a pause instead of spinning, so waiting does not burn a CPU core
- while(!jobCtrl.allFinished()){
- Thread.sleep(500);
- }
- // Everything has finished: print the jobs that succeeded
- System.out.println(jobCtrl.getSuccessfulJobList());
- if(!jobCtrl.getFailedJobList().isEmpty()){
- System.err.println(jobCtrl.getFailedJobList());
- anyFailed = true;
- }
- } catch (InterruptedException e) {
- // Restore the interrupt flag and fall through to stop the controller
- Thread.currentThread().interrupt();
- } finally {
- jobCtrl.stop();
- }
- if(anyFailed){
- System.exit(1);
- }
- }
- }
4、链式组合式
所谓链式MapReduce就是用多个Mapper处理任务,最后用一个Reducer输出结果,注意和迭代式和组合式MapReduce的不同之处
一个MapReduce作业可能会有一些前处理和后处理步骤,将这些前后处理步骤以单独的MapReduce任务实现也可以达到目的,但由于
增加了多个MapReduce作业,将增加整个作业的处理周期,而且还会增加很多I/O操作,因此处理效率不高。
Hadoop为此提供了专门的链式Mapper(ChainMapper)和链式Reducer(ChainReducer)来完成这种处理。
ChainMapper允许在一个单一Map任务中添加和使用多个Map子任务;而ChainReducer则允许在一个单一Reduce任务执行了Reduce处理
后,继续使用多个Map子任务完成一些后续处理。
- package com.hadoop.mapreduce;
- import java.io.IOException;
- import java.util.StringTokenizer;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.FileSystem;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.IntWritable;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
- import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- /**
- * Chained MapReduce example: a single job runs Map1 -> Map2 -> Map3 inside the
- * map task via ChainMapper and then Reduce1 via ChainReducer.
- */
- public class Chain {
- // First job
- /** Emits (token, 1) for every whitespace-separated token of the input line. */
- public static class Map1 extends Mapper<LongWritable, Text, Text, IntWritable>{
- // Reused for every record, like `word`, instead of allocating one IntWritable per token.
- private static final IntWritable ONE = new IntWritable(1);
- Text word = new Text();
- @Override
- protected void map(LongWritable key, Text value,Context context)
- throws IOException, InterruptedException {
- StringTokenizer st = new StringTokenizer(value.toString());
- while(st.hasMoreTokens()){
- word.set(st.nextToken());
- context.write(word, ONE);
- }
- }
- }
- /** Sums all counts received for a key and emits (key, sum). */
- public static class Reduce1 extends Reducer<Text, IntWritable, Text, IntWritable>{
- IntWritable result = new IntWritable();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values,Context context)
- throws IOException, InterruptedException {
- int sum = 0;
- for(IntWritable val : values){
- sum += val.get();
- }
- result.set(sum);
- context.write(key, result);
- }
- }
- // Second job
- /**
- * Second mapper of the chain. Its input is the (word, count) pair written by
- * the previous mapper; the pair is forwarded unchanged. Tokenizing the count
- * value here would replace every word key with the text of the count.
- */
- public static class Map2 extends Mapper<Text, IntWritable, Text, IntWritable>{
- @Override
- protected void map(Text key, IntWritable value,Context context)
- throws IOException, InterruptedException {
- context.write(key, value);
- }
- }
- /** Sums all counts received for a key and emits (key, sum). Not wired into the chain job below. */
- public static class Reduce2 extends Reducer<Text, IntWritable, Text, IntWritable>{
- IntWritable result = new IntWritable();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values,Context context)
- throws IOException, InterruptedException {
- int sum = 0;
- for(IntWritable val : values){
- sum += val.get();
- }
- result.set(sum);
- context.write(key, result);
- }
- }
- // Third job
- /**
- * Third mapper of the chain. Forwards the (word, count) pair it receives
- * unchanged so the reducer still groups by the original word.
- */
- public static class Map3 extends Mapper<Text, IntWritable, Text, IntWritable>{
- @Override
- protected void map(Text key, IntWritable value,Context context)
- throws IOException, InterruptedException {
- context.write(key, value);
- }
- }
- /** Sums all counts received for a key and emits (key, sum). Not wired into the chain job below. */
- public static class Reduce3 extends Reducer<Text, IntWritable, Text, IntWritable>{
- IntWritable result = new IntWritable();
- @Override
- protected void reduce(Text key, Iterable<IntWritable> values,Context context)
- throws IOException, InterruptedException {
- int sum = 0;
- for(IntWritable val : values){
- sum += val.get();
- }
- result.set(sum);
- context.write(key, result);
- }
- }
- /**
- * Recursively deletes {@code dir} if it exists. The file system is resolved
- * from the path itself, so a fully-qualified hdfs:// URI works even when the
- * default file system of {@code conf} is a different one.
- */
- private static void deleteIfExists(Configuration conf, String dir) throws IOException {
- Path path = new Path(dir);
- FileSystem fs = path.getFileSystem(conf);
- if (fs.exists(path)) {
- fs.delete(path, true);
- }
- }
- /**
- * Builds and runs the chained job, then exits with status 0 on success and
- * 1 on failure.
- *
- * @param args command-line arguments (not used; all paths are hard-coded)
- * @throws IOException if the file system or job setup fails
- * @throws ClassNotFoundException propagated from Job.waitForCompletion
- * @throws InterruptedException propagated from Job.waitForCompletion
- */
- public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException{
- String inPath1 = "hdfs://hadoop0:9000/user/root/input/";
- String outPath1 = "hdfs://hadoop0:9000/user/root/3DZout/";
- String outPath2 = "hdfs://hadoop0:9000/user/root/3DZout2/";
- String outPath3 = "hdfs://hadoop0:9000/user/root/3DZout3/";
- String[] inOut = {inPath1, outPath1};
- Configuration conf = new Configuration();
- String[] otherArgs = new GenericOptionsParser(conf, inOut).getRemainingArgs();
- if (otherArgs.length < 2) {
- System.err.println("Usage: wordcount <in> [<in>...] <out>");
- System.exit(2);
- }
- // Remove output directories left over from a previous run
- deleteIfExists(conf, outPath1);
- deleteIfExists(conf, outPath2);
- deleteIfExists(conf, outPath3);
- // job1 configuration
- Job job1 = Job.getInstance(conf);
- job1.setJarByClass(Chain.class);
- job1.setJobName("ChainJob");
- FileInputFormat.addInputPath(job1, new Path(inPath1));
- FileOutputFormat.setOutputPath(job1, new Path(outPath1));
- // A chain may contain several Mappers: each Mapper's input is the previous
- // Mapper's output, and the last Mapper's output is the Reducer's input.
- // The whole job has exactly one Reducer.
- ChainMapper.addMapper(job1, Map1.class, LongWritable.class, Text.class, Text.class, IntWritable.class, conf);
- ChainMapper.addMapper(job1, Map2.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);
- ChainMapper.addMapper(job1, Map3.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);
- // Execution order: map1 --> map2 --> map3 --> reduce1
- ChainReducer.setReducer(job1, Reduce1.class, Text.class, IntWritable.class, Text.class, IntWritable.class, conf);
- // Report the job result through the process exit status
- System.exit(job1.waitForCompletion(true) ? 0 : 1);
- }
- }