What this covers: how multiple MapReduce jobs are chained together and executed.
Many complex tasks must be decomposed into simpler subtasks, each of which is implemented as its own MapReduce job.
Hadoop supports linking multiple MapReduce jobs into one larger workflow.
Multiple MapReduce jobs vs. a single complex Map and Reduce
Example:
Find the 10 most-cited patents in a data set (a sketch of the two jobs follows below).
Job 1: invert the citation list and count how often each patent is cited.
Job 2: find the 10 patents with the largest citation counts.
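A minimal sketch of the two jobs, assuming each input line has the form "citing_patent,cited_patent", the usual Hadoop imports plus java.util.TreeMap and java.util.Map, job 1's output directory as job 2's input, and a single reducer for job 2. All class names here are hypothetical placeholders, not part of the original material:
// Job 1: invert the citation list and count citations per patent.
public static class CiteCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private static final LongWritable ONE = new LongWritable(1);
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] f = value.toString().split(",");
        if (f.length == 2) {
            context.write(new Text(f[1]), ONE);            // emit (cited patent, 1)
        }
    }
}
public static class CiteCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    public void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable v : values) {
            sum += v.get();
        }
        context.write(key, new LongWritable(sum));          // (patent, citation count)
    }
}
// Job 2: each mapper keeps a local top 10; a single reducer merges them into the global top 10.
// (Ties on the citation count overwrite each other in this simplified sketch.)
public static class Top10Mapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    private final TreeMap<Long, String> top = new TreeMap<Long, String>();
    public void map(LongWritable key, Text value, Context context) {
        String[] f = value.toString().split("\t");          // "patent <TAB> count" from job 1
        top.put(Long.parseLong(f[1]), f[0]);
        if (top.size() > 10) {
            top.remove(top.firstKey());                     // drop the smallest count
        }
    }
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<Long, String> e : top.entrySet()) {
            context.write(new LongWritable(e.getKey()), new Text(e.getValue()));
        }
    }
}
public static class Top10Reducer extends Reducer<LongWritable, Text, Text, LongWritable> {
    private final TreeMap<Long, String> top = new TreeMap<Long, String>();
    public void reduce(LongWritable key, Iterable<Text> values, Context context) {
        for (Text v : values) {
            top.put(key.get(), v.toString());
            if (top.size() > 10) {
                top.remove(top.firstKey());
            }
        }
    }
    protected void cleanup(Context context) throws IOException, InterruptedException {
        for (Map.Entry<Long, String> e : top.descendingMap().entrySet()) {
            context.write(new Text(e.getValue()), new LongWritable(e.getKey()));
        }
    }
}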
Chaining MapReduce jobs in sequence
This is analogous to a pipeline in Unix:
mapreduce-1 | mapreduce-2 | mapreduce-3 …
In the driver, create one Job per stage, set each stage's input path to the previous stage's output path, and wait for each stage in turn:
Job1.waitForCompletion(true);
Job2.waitForCompletion(true);
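A minimal driver sketch of this sequential pattern; MyDriver, Stage1Mapper/Stage1Reducer, Stage2Mapper/Stage2Reducer, and the path names are hypothetical placeholders, and each stage's input is the previous stage's output directory:
Configuration conf = new Configuration();

Job job1 = Job.getInstance(conf, "stage-1");
job1.setJarByClass(MyDriver.class);
job1.setMapperClass(Stage1Mapper.class);
job1.setReducerClass(Stage1Reducer.class);
// (output key/value classes and other job settings omitted for brevity)
FileInputFormat.addInputPath(job1, new Path(args[0]));
FileOutputFormat.setOutputPath(job1, new Path("stage1-out"));
if (!job1.waitForCompletion(true)) {
    System.exit(1);                                          // stop the pipeline if stage 1 fails
}

Job job2 = Job.getInstance(conf, "stage-2");
job2.setJarByClass(MyDriver.class);
job2.setMapperClass(Stage2Mapper.class);
job2.setReducerClass(Stage2Reducer.class);
FileInputFormat.addInputPath(job2, new Path("stage1-out"));  // stage 1's output feeds stage 2
FileOutputFormat.setOutputPath(job2, new Path(args[1]));
System.exit(job2.waitForCompletion(true) ? 0 : 1);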
Chaining MapReduce jobs with dependencies (non-linear)
For example, mapreduce-1 processes one data set, mapreduce-2 processes another data set, and mapreduce-3 performs an inner join on the outputs of the previous two stages.
Such non-linear dependencies between jobs are managed with the ControlledJob and JobControl classes.
Package org.apache.hadoop.mapreduce.lib.jobcontrol #javadoc#
JobControl This class encapsulates a set of MapReduce jobs and its dependency.
ControlledJob wraps one configured Job; dependencies are added with x.addDependingJob(y), meaning x will not start until y has completed.
JobControl manages and monitors the execution of the whole group; addJob() registers a job, and run(), executed in its own thread, keeps updating job states and submits jobs whose dependencies have been satisfied.
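A minimal sketch of the wiring, assuming conf is a Configuration and job1, job2, job3 are already-configured Job objects, with job3 reading the outputs of job1 and job2 (Example 2 below shows the complete program):
ControlledJob cj1 = new ControlledJob(conf);
cj1.setJob(job1);
ControlledJob cj2 = new ControlledJob(conf);
cj2.setJob(job2);
ControlledJob cj3 = new ControlledJob(conf);
cj3.setJob(job3);
cj3.addDependingJob(cj1);          // job3 will not start until job1 has completed
cj3.addDependingJob(cj2);          // ... and until job2 has completed

JobControl jc = new JobControl("avg-group");
jc.addJob(cj1);
jc.addJob(cj2);
jc.addJob(cj3);
new Thread(jc).start();            // JobControl implements Runnable
while (!jc.allFinished()) {        // poll until every job has finished
    Thread.sleep(500);
}
jc.stop();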
#Example 1#
package ex7;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class LinkedMR {
private static final Text TEXT_SUM = new Text("SUM");
private static final Text TEXT_COUNT = new Text("COUNT");
private static final Text TEXT_AVG = new Text("AVG");
// SumMapper: sums the values seen by a single map task and emits the partial sum in cleanup()
public static class SumMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
public long sum = 0;
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
sum += Long.parseLong(value.toString());
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(TEXT_SUM, new LongWritable(sum));
}
}
// SumReducer: adds the partial sums up into the total SUM
public static class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
public long sum = 0;
public void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
for (LongWritable v : values) {
sum += v.get();
}
context.write(TEXT_SUM, new LongWritable(sum));
}
}
// CountMapper: counts the records seen by a single map task
public static class CountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
public long count = 0;
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
count += 1;
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(TEXT_COUNT, new LongWritable(count));
}
}
// CountReducer: adds the partial counts up into the total COUNT
public static class CountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
public long count = 0;
public void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
for (LongWritable v : values) {
count += v.get();
}
context.write(TEXT_COUNT, new LongWritable(count));
}
}
// Average job, map side: parses the SUM and COUNT lines produced by the previous two jobs
public static class AvgMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
public long count = 0;
public long sum = 0;
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] v = value.toString().split("\t");
if (v[0].equals("COUNT")) {
count = Long.parseLong(v[1]);
} else if (v[0].equals("SUM")) {
sum = Long.parseLong(v[1]);
}
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new LongWritable(sum), new LongWritable(count));
}
}
// Average job, reduce side: computes AVG = SUM / COUNT
public static class AvgReducer extends Reducer<LongWritable, LongWritable, Text, DoubleWritable> {
public long sum = 0;
public long count = 0;
public void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
sum += key.get();
for (LongWritable v : values) {
count += v.get();
}
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(TEXT_AVG, new DoubleWritable((double) sum / count));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String inputPath = "testdata/example_1";
String sumOutputPath = "testdata/example1_out/sum";
String countOutputPath = "testdata/example1_out/count";
String avgOutputPath = "testdata/example1_out/avg";
Job job1 = Job.getInstance(conf, "Sum");
job1.setJarByClass(LinkedMR.class);
job1.setMapperClass(SumMapper.class);
job1.setCombinerClass(SumReducer.class);
job1.setReducerClass(SumReducer.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(job1, new Path(inputPath));
FileOutputFormat.setOutputPath(job1, new Path(sumOutputPath));
Job job2 = Job.getInstance(conf, "Count");
job2.setJarByClass(LinkedMR.class);
job2.setMapperClass(CountMapper.class);
job2.setCombinerClass(CountReducer.class);
job2.setReducerClass(CountReducer.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(job2, new Path(inputPath));
FileOutputFormat.setOutputPath(job2, new Path(countOutputPath));
Job job3 = Job.getInstance(conf, "Average");
job3.setJarByClass(LinkedMR.class);
job3.setMapperClass(AvgMapper.class);
job3.setReducerClass(AvgReducer.class);
job3.setMapOutputKeyClass(LongWritable.class);
job3.setMapOutputValueClass(LongWritable.class);
job3.setOutputKeyClass(Text.class);
job3.setOutputValueClass(DoubleWritable.class);
// Use the outputs of job1 and job2 as the input of job3
FileInputFormat.addInputPath(job3, new Path(sumOutputPath));
FileInputFormat.addInputPath(job3, new Path(countOutputPath));
FileOutputFormat.setOutputPath(job3, new Path(avgOutputPath));
// Submit job1 and job2 and wait for them to finish, then run job3
if (job1.waitForCompletion(true) && job2.waitForCompletion(true)) {
System.exit(job3.waitForCompletion(true) ? 0 : 1);
}
}
}
#Example 2#
package ex7;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class DependingMR {
private static final Text TEXT_SUM = new Text("SUM");
private static final Text TEXT_COUNT = new Text("COUNT");
private static final Text TEXT_AVG = new Text("AVG");
// SumMapper: sums the values seen by a single map task and emits the partial sum in cleanup()
public static class SumMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
public long sum = 0;
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
sum += Long.parseLong(value.toString());
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(TEXT_SUM, new LongWritable(sum));
}
}
// SumReducer: adds the partial sums up into the total SUM
public static class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
public long sum = 0;
public void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
for (LongWritable v : values) {
sum += v.get();
}
context.write(TEXT_SUM, new LongWritable(sum));
}
}
// CountMapper: counts the records seen by a single map task
public static class CountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
public long count = 0;
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
count += 1;
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(TEXT_COUNT, new LongWritable(count));
}
}
// CountReducer: adds the partial counts up into the total COUNT
public static class CountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
public long count = 0;
public void reduce(Text key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
for (LongWritable v : values) {
count += v.get();
}
context.write(TEXT_COUNT, new LongWritable(count));
}
}
// Average job, map side: parses the SUM and COUNT lines produced by the previous two jobs
public static class AvgMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
public long count = 0;
public long sum = 0;
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] v = value.toString().split("\t");
if (v[0].equals("COUNT")) {
count = Long.parseLong(v[1]);
} else if (v[0].equals("SUM")) {
sum = Long.parseLong(v[1]);
}
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(new LongWritable(sum), new LongWritable(count));
}
}
// Average job, reduce side: computes AVG = SUM / COUNT
public static class AvgReducer extends Reducer<LongWritable, LongWritable, Text, DoubleWritable> {
public long sum = 0;
public long count = 0;
public void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
throws IOException, InterruptedException {
sum += key.get();
for (LongWritable v : values) {
count += v.get();
}
}
protected void cleanup(Context context) throws IOException, InterruptedException {
context.write(TEXT_AVG, new DoubleWritable((double) sum / count));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String inputPath = "testdata/lab4";
String sumOutputPath = "testdata/lab4-out/sum";
String countOutputPath = "testdata/lab4-out/count";
String avgOutputPath = "testdata/lab4-out/avg";
Job job1 = Job.getInstance(conf, "Sum");
job1.setJarByClass(DependingMR.class);
job1.setMapperClass(SumMapper.class);
job1.setCombinerClass(SumReducer.class);
job1.setReducerClass(SumReducer.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(job1, new Path(inputPath));
FileOutputFormat.setOutputPath(job1, new Path(sumOutputPath));
Job job2 = Job.getInstance(conf, "Count");
job2.setJarByClass(DependingMR.class);
job2.setMapperClass(CountMapper.class);
job2.setCombinerClass(CountReducer.class);
job2.setReducerClass(CountReducer.class);
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(job2, new Path(inputPath));
FileOutputFormat.setOutputPath(job2, new Path(countOutputPath));
Job job3 = Job.getInstance(conf, "Average");
job3.setJarByClass(DependingMR.class);
job3.setMapperClass(AvgMapper.class);
job3.setReducerClass(AvgReducer.class);
job3.setMapOutputKeyClass(LongWritable.class);
job3.setMapOutputValueClass(LongWritable.class);
job3.setOutputKeyClass(Text.class);
job3.setOutputValueClass(DoubleWritable.class);
// Use the outputs of job1 and job2 as the input of job3
FileInputFormat.addInputPath(job3, new Path(sumOutputPath));
FileInputFormat.addInputPath(job3, new Path(countOutputPath));
FileOutputFormat.setOutputPath(job3, new Path(avgOutputPath));
ControlledJob contlJob1 = new ControlledJob(conf);
contlJob1.setJob(job1);
ControlledJob contlJob2 = new ControlledJob(conf);
contlJob2.setJob(job2);
ControlledJob contlJob3 = new ControlledJob(conf);
contlJob3.setJob(job3);
contlJob3.addDependingJob(contlJob1);
contlJob3.addDependingJob(contlJob2);
JobControl jobContl = new JobControl("depending job");
jobContl.addJob(contlJob1);
jobContl.addJob(contlJob2);
jobContl.addJob(contlJob3);
// Run the JobControl in its own thread and poll until all jobs have finished
Thread t = new Thread(jobContl);
t.start();
while (!jobContl.allFinished()) {
Thread.sleep(500);
}
System.out.println(jobContl.getSuccessfulJobList());
jobContl.stop();
// // Alternative: submit job1 and job2 with waitForCompletion and wait for them, then run job3 (as in Example 1):
// if (job1.waitForCompletion(true) && job2.waitForCompletion(true)) {
// System.exit(job3.waitForCompletion(true) ? 0 : 1);
// }
}
}
Pre-processing and post-processing within a job
Ways of chaining MapReduce stages:
[MAP | REDUCE]+
Each job in such a chain writes its intermediate results to disk, consuming I/O and storage.
Chaining pre-processing and post-processing steps into a single job:
MAP+ | REDUCE | MAP*
Map1 | Map2 | Reduce | Map3 | Map4
Treat Map2 and Reduce as the core of the MapReduce job, with Map1 as a pre-processing step and Map3 and Map4 as post-processing steps.
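Hadoop supports this MAP+ | REDUCE | MAP* pattern with the ChainMapper and ChainReducer classes in org.apache.hadoop.mapreduce.lib.chain; all chained mappers run inside one job, so no intermediate job output is written to HDFS between them. A minimal driver sketch follows, in which ChainDriver and the Map1/Map2/Reduce/Map3/Map4 classes are hypothetical placeholders and the key/value classes merely illustrate that each stage's output types must match the next stage's input types:
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "chained job");
job.setJarByClass(ChainDriver.class);

// Pre-processing mappers (the MAP+ part)
ChainMapper.addMapper(job, Map1.class,
        LongWritable.class, Text.class, Text.class, Text.class, new Configuration(false));
ChainMapper.addMapper(job, Map2.class,
        Text.class, Text.class, Text.class, LongWritable.class, new Configuration(false));

// The core reducer (the REDUCE part)
ChainReducer.setReducer(job, Reduce.class,
        Text.class, LongWritable.class, Text.class, LongWritable.class, new Configuration(false));

// Post-processing mappers that run after the reducer (the MAP* part)
ChainReducer.addMapper(job, Map3.class,
        Text.class, LongWritable.class, Text.class, LongWritable.class, new Configuration(false));
ChainReducer.addMapper(job, Map4.class,
        Text.class, LongWritable.class, Text.class, LongWritable.class, new Configuration(false));

// The job's output types match the last chained mapper
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(LongWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);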