Hadoop: Chaining MapReduce Jobs

Meaning: how multiple MapReduce jobs are chained together and executed as one larger workflow.

Many complex tasks have to be decomposed into simpler subtasks, and each subtask is implemented as its own MapReduce job.
Hadoop supports chaining several MapReduce jobs into one larger job.
Multiple MapReduce jobs vs. a single complex Map and Reduce
Example:
Find the 10 most-cited patents in a dataset
MapReduce job 1: invert the citation list and count citations per patent
MapReduce job 2: find the 10 largest counts

Chaining MapReduce jobs sequentially
Analogous to a Unix pipeline:
mapreduce-1 | mapreduce-2 | mapreduce-3 …

In the driver, create one Job per stage, set each stage's input path to the previous stage's output path, and run the stages in order:
job1.waitForCompletion(true);
job2.waitForCompletion(true);
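
A minimal driver sketch of this pattern, applied to the patent example above. The classes (PatentDriver, CitationCountMapper, CitationCountReducer, TopTenMapper, TopTenReducer) and paths are hypothetical placeholders; imports are the same as in Example 1 below:

Configuration conf = new Configuration();

// Stage 1: invert the citation list and count citations per patent
Job job1 = Job.getInstance(conf, "CitationCount");
job1.setJarByClass(PatentDriver.class);              // hypothetical driver class
job1.setMapperClass(CitationCountMapper.class);      // hypothetical: emit (citedPatent, 1)
job1.setReducerClass(CitationCountReducer.class);    // hypothetical: sum the 1s per patent
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(job1, new Path("patents/cite"));
FileOutputFormat.setOutputPath(job1, new Path("patents/counts"));

// Stage 2: pick the 10 largest counts
Job job2 = Job.getInstance(conf, "TopTen");
job2.setJarByClass(PatentDriver.class);
job2.setMapperClass(TopTenMapper.class);             // hypothetical: keep a per-task top 10
job2.setReducerClass(TopTenReducer.class);           // hypothetical: merge to a global top 10
job2.setNumReduceTasks(1);                           // a single reducer sees all candidates
job2.setOutputKeyClass(LongWritable.class);
job2.setOutputValueClass(Text.class);
// The input path of the current stage is the output path of the previous one
FileInputFormat.addInputPath(job2, new Path("patents/counts"));
FileOutputFormat.setOutputPath(job2, new Path("patents/top10"));

// Run the stages strictly in order, like mapreduce-1 | mapreduce-2
if (job1.waitForCompletion(true)) {
	System.exit(job2.waitForCompletion(true) ? 0 : 1);
}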

Chaining MapReduce jobs with dependencies (non-linear)
Suppose mapreduce-1 processes one dataset, mapreduce-2 processes another dataset, and mapreduce-3 performs an inner join on the outputs of the first two stages.
This kind of non-linear dependency between jobs is managed with the ControlledJob and JobControl classes.
Package: org.apache.hadoop.mapreduce.lib.jobcontrol #javadoc#
JobControl: This class encapsulates a set of MapReduce jobs and its dependency.


ControlledJob wraps a configured Job object; dependencies are declared with x.addDependingJob(y), meaning x will not start until y has completed.
JobControl manages and monitors job execution: the addJob() method registers a job, and run() is the main loop that keeps updating job states and submits jobs whose dependencies have been satisfied (see the sketch below).
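
A minimal sketch of that wiring (Example 2 below is the complete program). It assumes job1, job2 and job3 are already-configured Job objects, where job3 reads the outputs of the other two, and conf is their Configuration:

// Wrap each configured Job in a ControlledJob so dependencies can be declared
ControlledJob cj1 = new ControlledJob(conf);
cj1.setJob(job1);
ControlledJob cj2 = new ControlledJob(conf);
cj2.setJob(job2);
ControlledJob cj3 = new ControlledJob(conf);
cj3.setJob(job3);

// job3 will not start until both job1 and job2 have completed
cj3.addDependingJob(cj1);
cj3.addDependingJob(cj2);

// run() keeps updating job states and submits jobs whose dependencies are
// satisfied, so JobControl is normally driven from its own thread
JobControl control = new JobControl("avg-workflow");  // arbitrary group name
control.addJob(cj1);
control.addJob(cj2);
control.addJob(cj3);
new Thread(control).start();

while (!control.allFinished()) {                      // wait for the whole workflow
	Thread.sleep(500);
}
control.stop();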

#Example 1#

package ex7;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class LinkedMR {
	private static final Text TEXT_SUM = new Text("SUM");
	private static final Text TEXT_COUNT = new Text("COUNT");
	private static final Text TEXT_AVG = new Text("AVG");

	// SumMapper: accumulate the sum of the values seen by a single map task
	public static class SumMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

		public long sum = 0;

		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			sum += Long.parseLong(value.toString());
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(TEXT_SUM, new LongWritable(sum));
		}
	}

	// SumReducer: add up the partial sums
	public static class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

		public long sum = 0;

		public void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			for (LongWritable v : values) {
				sum += v.get();
			}
			context.write(TEXT_SUM, new LongWritable(sum));
		}
	}

	// CountMapper: count the records seen by a single map task
	public static class CountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

		public long count = 0;

		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			count += 1;
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(TEXT_COUNT, new LongWritable(count));
		}
	}

	// CountReducer: add up the partial counts
	public static class CountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

		public long count = 0;

		public void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			for (LongWritable v : values) {
				count += v.get();
			}
			context.write(TEXT_COUNT, new LongWritable(count));
		}
	}

	
	// Average job, Mapper: read the SUM and COUNT lines produced by the first two jobs
	public static class AvgMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {

		public long count = 0;
		public long sum = 0;

		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] v = value.toString().split("\t");
			if (v[0].equals("COUNT")) {
				count = Long.parseLong(v[1]);
			} else if (v[0].equals("SUM")) {
				sum = Long.parseLong(v[1]);
			}
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(new LongWritable(sum), new LongWritable(count));
		}

	}
	
	// Average job, Reducer: compute the average as sum / count
	public static class AvgReducer extends Reducer<LongWritable, LongWritable, Text, DoubleWritable> {

		public long sum = 0;
		public long count = 0;

		public void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			sum += key.get();
			for (LongWritable v : values) {
				count += v.get();
			}
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(TEXT_AVG, new DoubleWritable((double) sum / count));
		}

	}

	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();

		String inputPath = "testdata/example_1";
		String sumOutputPath = "testdata/example1_out/sum";
		String countOutputPath = "testdata/example1_out/count";
		String avgOutputPath = "testdata/example1_out/avg";

		Job job1 = Job.getInstance(conf, "Sum");
		job1.setJarByClass(LinkedMR.class);
		job1.setMapperClass(SumMapper.class);
		job1.setCombinerClass(SumReducer.class);
		job1.setReducerClass(SumReducer.class);
		job1.setOutputKeyClass(Text.class);
		job1.setOutputValueClass(LongWritable.class);
		FileInputFormat.addInputPath(job1, new Path(inputPath));
		FileOutputFormat.setOutputPath(job1, new Path(sumOutputPath));

		Job job2 = Job.getInstance(conf, "Count");
		job2.setJarByClass(LinkedMR.class);
		job2.setMapperClass(CountMapper.class);
		job2.setCombinerClass(CountReducer.class);
		job2.setReducerClass(CountReducer.class);
		job2.setOutputKeyClass(Text.class);
		job2.setOutputValueClass(LongWritable.class);
		FileInputFormat.addInputPath(job2, new Path(inputPath));
		FileOutputFormat.setOutputPath(job2, new Path(countOutputPath));

		Job job3 = Job.getInstance(conf, "Average");
		job3.setJarByClass(LinkedMR.class);
		job3.setMapperClass(AvgMapper.class);
		job3.setReducerClass(AvgReducer.class);
		job3.setMapOutputKeyClass(LongWritable.class);
		job3.setMapOutputValueClass(LongWritable.class);
		job3.setOutputKeyClass(Text.class);
		job3.setOutputValueClass(DoubleWritable.class);

		// Use the outputs of job1 and job2 as the input of job3
		FileInputFormat.addInputPath(job3, new Path(sumOutputPath));
		FileInputFormat.addInputPath(job3, new Path(countOutputPath));
		FileOutputFormat.setOutputPath(job3, new Path(avgOutputPath));

		// Submit job1 and job2 and wait for them to complete
		if (job1.waitForCompletion(true) && job2.waitForCompletion(true)) {
			System.exit(job3.waitForCompletion(true) ? 0 : 1);
		}
	}
}

#Example 2#

package ex7;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class DependingMR {

	private static final Text TEXT_SUM = new Text("SUM");
	private static final Text TEXT_COUNT = new Text("COUNT");
	private static final Text TEXT_AVG = new Text("AVG");

	// SumMapper: accumulate the sum of the values seen by a single map task
	public static class SumMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

		public long sum = 0;

		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			sum += Long.parseLong(value.toString());
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(TEXT_SUM, new LongWritable(sum));
		}
	}

	// SumReducer: add up the partial sums
	public static class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

		public long sum = 0;

		public void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			for (LongWritable v : values) {
				sum += v.get();
			}
			context.write(TEXT_SUM, new LongWritable(sum));
		}
	}

	// CountMapper: count the records seen by a single map task
	public static class CountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

		public long count = 0;

		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			count += 1;
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(TEXT_COUNT, new LongWritable(count));
		}
	}

	// CountReducer: add up the partial counts
	public static class CountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

		public long count = 0;

		public void reduce(Text key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			for (LongWritable v : values) {
				count += v.get();
			}
			context.write(TEXT_COUNT, new LongWritable(count));
		}
	}

	
	// Average job, Mapper: read the SUM and COUNT lines produced by the first two jobs
	public static class AvgMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {

		public long count = 0;
		public long sum = 0;

		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] v = value.toString().split("\t");
			if (v[0].equals("COUNT")) {
				count = Long.parseLong(v[1]);
			} else if (v[0].equals("SUM")) {
				sum = Long.parseLong(v[1]);
			}
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(new LongWritable(sum), new LongWritable(count));
		}

	}
	
	// Average job, Reducer: compute the average as sum / count
	public static class AvgReducer extends Reducer<LongWritable, LongWritable, Text, DoubleWritable> {

		public long sum = 0;
		public long count = 0;

		public void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
				throws IOException, InterruptedException {
			sum += key.get();
			for (LongWritable v : values) {
				count += v.get();
			}
		}

		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(TEXT_AVG, new DoubleWritable((double) sum / count));
		}

	}

	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();

		String inputPath = "testdata/lab4";
		String sumOutputPath = "testdata/lab4-out/sum";
		String countOutputPath = "testdata/lab4-out/count";
		String avgOutputPath = "testdata/lab4-out/avg";

		Job job1 = Job.getInstance(conf, "Sum");
		job1.setJarByClass(DependingMR.class);
		job1.setMapperClass(SumMapper.class);
		job1.setCombinerClass(SumReducer.class);
		job1.setReducerClass(SumReducer.class);
		job1.setOutputKeyClass(Text.class);
		job1.setOutputValueClass(LongWritable.class);
		FileInputFormat.addInputPath(job1, new Path(inputPath));
		FileOutputFormat.setOutputPath(job1, new Path(sumOutputPath));

		Job job2 = Job.getInstance(conf, "Count");
		job2.setJarByClass(DependingMR.class);
		job2.setMapperClass(CountMapper.class);
		job2.setCombinerClass(CountReducer.class);
		job2.setReducerClass(CountReducer.class);
		job2.setOutputKeyClass(Text.class);
		job2.setOutputValueClass(LongWritable.class);
		FileInputFormat.addInputPath(job2, new Path(inputPath));
		FileOutputFormat.setOutputPath(job2, new Path(countOutputPath));

		Job job3 = Job.getInstance(conf, "Average");
		job3.setJarByClass(DependingMR.class);
		job3.setMapperClass(AvgMapper.class);
		job3.setReducerClass(AvgReducer.class);
		job3.setMapOutputKeyClass(LongWritable.class);
		job3.setMapOutputValueClass(LongWritable.class);
		job3.setOutputKeyClass(Text.class);
		job3.setOutputValueClass(DoubleWritable.class);

		// Use the outputs of job1 and job2 as the input of job3
		FileInputFormat.addInputPath(job3, new Path(sumOutputPath));
		FileInputFormat.addInputPath(job3, new Path(countOutputPath));
		FileOutputFormat.setOutputPath(job3, new Path(avgOutputPath));
		
		ControlledJob contlJob1=new ControlledJob(conf);
		contlJob1.setJob(job1);
		
		ControlledJob contlJob2=new ControlledJob(conf);
		contlJob2.setJob(job2);
		
		ControlledJob contlJob3=new ControlledJob(conf);
		contlJob3.setJob(job3);
		contlJob3.addDependingJob(contlJob1);
		contlJob3.addDependingJob(contlJob2);
		
		JobControl jobContl=new JobControl("depending job");
		jobContl.addJob(contlJob1);
		jobContl.addJob(contlJob2);
		jobContl.addJob(contlJob3);
		
		Thread t=new Thread(jobContl);
		t.start();
		
		// Poll until the whole workflow has finished, then stop the JobControl thread
		while (!jobContl.allFinished()) {
			Thread.sleep(500);
		}
		System.out.println(jobContl.getSuccessfulJobList());
		jobContl.stop();

//		// Submit job1 and job2 and wait for them to complete
//		if (job1.waitForCompletion(true) && job2.waitForCompletion(true)) {
//			System.exit(job3.waitForCompletion(true) ? 0 : 1);
//		}
	}
}

Job pre-processing and post-processing
Ways of chaining MapReduce:
[MAP | REDUCE]+
The intermediate results of every job in such a chain consume I/O and storage resources.


Chaining with pre-processing and post-processing steps
MAP+ | REDUCE | MAP*
Map1 | Map2 | Reduce | Map3 | Map4
Treat Map2 and the Reduce as the core of the MapReduce job; Map1 serves as pre-processing, and Map3 and Map4 serve as post-processing.
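
In the new API this pattern is provided by the ChainMapper and ChainReducer classes in org.apache.hadoop.mapreduce.lib.chain: all chained mappers and the single reducer run inside one job, so the chain itself writes no intermediate job output to HDFS. A minimal sketch, where Map1/Map2/Reduce1/Map3/Map4 and ChainDriver are hypothetical classes whose key/value types line up as shown:

// MAP+ | REDUCE | MAP* inside a single job: Map1 | Map2 | Reduce1 | Map3 | Map4
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "ChainJob");
job.setJarByClass(ChainDriver.class);                 // hypothetical driver class

// Pre-processing mappers (the MAP+ part)
ChainMapper.addMapper(job, Map1.class,
		LongWritable.class, Text.class, Text.class, Text.class, new Configuration(false));
ChainMapper.addMapper(job, Map2.class,
		Text.class, Text.class, Text.class, LongWritable.class, new Configuration(false));

// The single reducer (the REDUCE part)
ChainReducer.setReducer(job, Reduce1.class,
		Text.class, LongWritable.class, Text.class, LongWritable.class, new Configuration(false));

// Post-processing mappers (the MAP* part), applied to the reducer's output
ChainReducer.addMapper(job, Map3.class,
		Text.class, LongWritable.class, Text.class, LongWritable.class, new Configuration(false));
ChainReducer.addMapper(job, Map4.class,
		Text.class, LongWritable.class, Text.class, Text.class, new Configuration(false));

FileInputFormat.addInputPath(job, new Path("chain/in"));
FileOutputFormat.setOutputPath(job, new Path("chain/out"));
System.exit(job.waitForCompletion(true) ? 0 : 1);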
