Sometimes a complex task cannot be completed by a single MapReduce job and has to be decomposed into several subtasks, each of which is a separate MapReduce job. In that case the MapReduce jobs need to be connected together.
1. Sequential (iterative) composition
In the iterative style of MapReduce, similar to a for loop, the output of one MapReduce job is used as the input of the next; once the whole task has finished, the intermediate results can be deleted.
Configuration conf1 = new Configuration();
Job job1 = new Job(conf1,"job1");
.....
FileInputFormat.addInputPath(job1,InputPath1);
FileOutputFormat.setOutputPath(job1,Outpath1);
job1.waitForCompletion(true);
//sub MapReduce
Configuration conf2 = new Configuration();
Job job2 = new Job(conf2,"job2");
.....
FileInputFormat.addInputPath(job2,Outpath1);
FileOutputFormat.setOutputPath(job2,Outpath2);
job2.waitForCompletion(true);
//sub MapReduce
Configuration conf3 = new Configuration();
Job job3 = new Job(conf3,"job3");
.....
FileInputFormat.addInputPath(job3,Outpath2);
FileOutputFormat.setOutputPath(job3,Outpath3);
job3.waitForCompletion(true);
.....
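The text above notes that the intermediate results can be deleted once the whole chain has finished. A minimal sketch of that cleanup, assuming Outpath1 and Outpath2 are the org.apache.hadoop.fs.Path objects used above (it also needs an import of org.apache.hadoop.fs.FileSystem):
//delete the intermediate outputs after the final job has succeeded
FileSystem fs = FileSystem.get(conf3);
if (job3.isSuccessful()) {
    fs.delete(Outpath1, true);//true = delete the output directory recursively
    fs.delete(Outpath2, true);
}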
2. MapReduce jobs with dependencies
Suppose a computation consists of three MapReduce sub-jobs job1, job2 and job3, where job1 and job2 are independent of each other and job3 may only run after both job1 and job2 have completed. This is a composition of MapReduce jobs with dependencies. Hadoop provides an execution and control mechanism for such workflows through the Job and JobControl classes (in the new org.apache.hadoop.mapreduce API each job is wrapped in a ControlledJob). A controlled job holds not only the configuration of its sub-job but also the sub-job's dependencies, while JobControl drives the whole workflow: add all sub-jobs to a JobControl and invoke its run() method. Since run() blocks while it manages the jobs, it is usually executed in its own thread and the JobControl is stopped once allFinished() returns true, as the complete example below shows.
Configuration job1conf = new Configuration();
Job job1 = new Job(job1conf,"Job1");
.........//other settings for job1
Configuration job2conf = new Configuration();
Job job2 = new Job(job2conf,"Job2");
.........//other settings for job2
Configuration job3conf = new Configuration();
Job job3 = new Job(job3conf,"Job3");
.........//other settings for job3
ControlledJob ctrljob1 = new ControlledJob(job1conf);
ctrljob1.setJob(job1);
ControlledJob ctrljob2 = new ControlledJob(job2conf);
ctrljob2.setJob(job2);
ControlledJob ctrljob3 = new ControlledJob(job3conf);
ctrljob3.setJob(job3);
ctrljob3.addDependingJob(ctrljob1);//job3 depends on job1
ctrljob3.addDependingJob(ctrljob2);//job3 depends on job2
JobControl JC = new JobControl("123");
JC.addJob(ctrljob1);//add the three controlled jobs to the JobControl
JC.addJob(ctrljob2);
JC.addJob(ctrljob3);
JC.run();//usually started in its own thread; see the complete example below
Complete example:
Each input record has the form X,Y, meaning that patent Y is cited by patent X. We want the K most-cited patents.
The first MapReduce job counts how many times each patent is cited; the second MapReduce job computes the top K.
package Chapter5;
import java.io.IOException;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class DependingJoin extends Configured implements Tool {
//Job 1 mapper: each record is "citing,cited" (key = citing patent, value = cited patent); emit (cited patent, 1)
public static class MapClass1 extends Mapper<Text, Text, Text, IntWritable>{
private static IntWritable one=new IntWritable(1);
public void map(Text key,Text value,Context context)throws IOException,InterruptedException{
context.write(value, one);
}
}
//Job 1 reducer: sum the 1s to get each patent's citation count
public static class Reduce1 extends Reducer<Text, IntWritable, Text,IntWritable>{
public void reduce(Text key,Iterable<IntWritable> values,Context context)throws IOException,InterruptedException{
int sum=0;
for(IntWritable x : values){
sum+=x.get();
}
context.write(key, new IntWritable(sum));
}
}
private static final int K=10;//size of the top-K list
//Job 2 mapper: key = patent, value = its citation count; keep a local top-K in a TreeMap
//(sorted by count ascending, so the smallest entry is evicted; patents with equal counts overwrite each other)
public static class MapClass2 extends Mapper<Text, Text, IntWritable, Text>{
TreeMap<Integer, String> map=new TreeMap<Integer,String>();
public void map(Text key,Text value,Context context) throws IOException,InterruptedException{
if(key!=null&&value!=null){
String patent=key.toString();
Integer num=Integer.parseInt(value.toString());
map.put(num, patent);
if(map.size()>K){
map.remove(map.firstKey());
}
}
}
@Override
protected void cleanup(Mapper<Text, Text, IntWritable, Text>.Context context)throws IOException,InterruptedException{
for(Integer num:map.keySet()){
context.write(new IntWritable(num) ,new Text(map.get(num)));
}
}
}
//Job 2 reducer: merge the per-mapper top-K lists and emit the global top-K (ascending by count)
public static class Reduce2 extends Reducer<IntWritable, Text, Text, IntWritable>{
TreeMap<Integer, String> map=new TreeMap<Integer,String>();
public void reduce(IntWritable key,Iterable<Text> value,Context context)throws IOException,InterruptedException{
for(Text text : value){
map.put(key.get(), text.toString());
if(map.size()>K){
map.remove(map.firstKey());
}
}
}
@Override
protected void cleanup(Reducer<IntWritable, Text, Text, IntWritable>.Context context)throws IOException,InterruptedException{
for(Integer num:map.keySet()){
context.write(new Text(map.get(num)),new IntWritable(num));
}
}
}
@Override
public int run(String[] arg0) throws Exception {
//job1: count how many times each patent is cited
Configuration configuration1=getConf();
configuration1.set("key.value.separator.in.input.line", ",");
configuration1.set("mapred.textoutputformat.separator", "\t");
Job job1=new Job(configuration1, "citenum");
FileInputFormat.addInputPath(job1, new Path(arg0[0]));
FileOutputFormat.setOutputPath(job1, new Path(arg0[1]));
job1.setJarByClass(DependingJoin.class);
job1.setMapperClass(MapClass1.class);
job1.setReducerClass(Reduce1.class);
job1.setOutputKeyClass(Text.class);
job1.setOutputValueClass(IntWritable.class);
job1.setInputFormatClass(KeyValueTextInputFormat.class);
job1.setOutputFormatClass(TextOutputFormat.class);
ControlledJob ctrjob1=new ControlledJob(configuration1);
ctrjob1.setJob(job1);
//job2: pick the K most-cited patents
Configuration configuration2=new Configuration(getConf());//getConf() returns the same object as configuration1, so take a copy before changing the separator
configuration2.set("key.value.separator.in.input.line", "\t");
Job job2=new Job(configuration2, "citenum2");
FileInputFormat.addInputPath(job2, new Path(arg0[1]));
FileOutputFormat.setOutputPath(job2, new Path(arg0[2]));
job2.setJarByClass(DependingJoin.class);
job2.setMapperClass(MapClass2.class);
job2.setReducerClass(Reduce2.class);
job2.setNumReduceTasks(1);//a single reducer so that the top-K list is global
job2.setOutputKeyClass(Text.class);
job2.setOutputValueClass(IntWritable.class);
job2.setMapOutputKeyClass(IntWritable.class);
job2.setMapOutputValueClass(Text.class);
job2.setInputFormatClass(KeyValueTextInputFormat.class);
job2.setOutputFormatClass(TextOutputFormat.class);
ControlledJob ctrjob2=new ControlledJob(configuration2);
ctrjob2.setJob(job2);
//depend
ctrjob2.addDependingJob(ctrjob1);
JobControl jobControl=new JobControl("myjob");
jobControl.addJob(ctrjob1);
jobControl.addJob(ctrjob2);
Thread thread=new Thread(jobControl);
thread.start();
//poll until every job in the control group has finished
while(!jobControl.allFinished()){
Thread.sleep(500);
}
System.out.println(jobControl.getSuccessfulJobList());
jobControl.stop();
return jobControl.getFailedJobList().isEmpty()?0:1;
}
public static void main(String[] args) throws Exception {
int res=ToolRunner.run(new Configuration(), new DependingJoin(), args);
System.exit(res);
}
}
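Note that key.value.separator.in.input.line and mapred.textoutputformat.separator are the property names of older Hadoop releases. On Hadoop 2.x the new-API classes read their separators from renamed keys, so (assuming a Hadoop 2.x cluster) the equivalent settings would be:
//Hadoop 2.x property names for the same settings
configuration1.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");
configuration1.set("mapreduce.output.textoutputformat.separator", "\t");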
3. Chaining pre- and post-processing stages
First, an example of why chained MapReduce is useful. Suppose that while counting words we encounter variants such as make, made and making; they all belong to one word and should be accumulated together. This could be handled by a separate MapReduce job, but adding another job lengthens the whole processing pipeline and adds extra I/O, so it is inefficient. A better approach is to add an auxiliary map step alongside the core MapReduce and merge them into a single chained MapReduce job, so the whole task still runs as one job. Hadoop provides ChainMapper and ChainReducer for exactly this: ChainMapper lets a single map task run several mapper sub-steps in sequence, and ChainReducer lets additional mapper sub-steps run after the reducer. A sketch of such a pre-processing mapper follows; the complete chained example for the patent data comes after it.
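A minimal sketch of the auxiliary map step described above, assuming a plain word-count job; the NormalizeMapper class and its tiny variant table are hypothetical and only for illustration (it also needs java.util.HashMap and java.util.Map):
//Hypothetical pre-processing mapper: rewrites each line so word variants collapse onto a canonical form
public static class NormalizeMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    private static final Map<String, String> CANONICAL = new HashMap<String, String>();
    static {
        CANONICAL.put("made", "make");
        CANONICAL.put("making", "make");
    }
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        for (Map.Entry<String, String> e : CANONICAL.entrySet()) {
            line = line.replaceAll("\\b" + e.getKey() + "\\b", e.getValue());
        }
        //hand the normalized line to the next mapper in the chain
        context.write(key, new Text(line));
    }
}
//it would be placed in front of the core counting mapper with, e.g.:
//ChainMapper.addMapper(job, NormalizeMapper.class, LongWritable.class, Text.class,
//        LongWritable.class, Text.class, new Configuration());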
package Chapter5;
import java.io.IOException;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class ChainJob extends Configured implements Tool {
//Chain mapper 1: with KeyValueTextInputFormat and separator ",", key = citing patent, value = cited patent; keep only records whose key is numeric
public static class MapClass1 extends Mapper<Text, Text, Text, Text>{
public void map(Text key,Text value,Context context)throws IOException,InterruptedException{
if (key!=null&&Pattern.matches("\\d+", key.toString())) {
//System.out.println(key+"&&&"+value);
context.write(key, value);
}
}
}
//Chain mapper 2: swap key and value so that records are grouped by the cited patent
public static class MapClass2 extends Mapper<Text, Text, Text, Text>{
public void map(Text key,Text value,Context context)throws IOException,InterruptedException{
context.write(value, key);
}
}
//Core reducer: concatenate all citing patents of a cited patent into a comma-separated list
public static class Reduce extends Reducer<Text, Text, Text,Text>{
public void reduce(Text key,Iterable<Text> values,Context context)throws IOException,InterruptedException{
String res="";
for(Text x : values){
if(res.length()>0)res+=",";
res+=x.toString();
}
context.write(key, new Text(res));
}
}
//Post-reduce chain mapper: count how many citing patents appear in the list
public static class MapClass3 extends Mapper<Text, Text, Text, IntWritable>{
public void map(Text key,Text value,Context context)throws IOException,InterruptedException{
String tokens []=value.toString().split(",");
int sum=tokens.length;
context.write(key, new IntWritable(sum));
}
}
@Override
public int run(String[] arg0) throws Exception {
Configuration configuration=getConf();
configuration.set("key.value.separator.in.input.line", ",");
Job job=new Job(configuration,"chainjob");
job.setJarByClass(ChainJob.class);
job.setInputFormatClass(KeyValueTextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
//the mapper chain hands Text/Text pairs to the shuffle, and the chain ends with Text/IntWritable output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.setInputPaths(job, new Path(arg0[0]));
FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
//Build the chain: MapClass1 -> MapClass2 -> [Reduce -> MapClass3], all within a single MapReduce job
ChainMapper.addMapper(job, MapClass1.class, Text.class, Text.class, Text.class, Text.class, new Configuration());
ChainMapper.addMapper(job, MapClass2.class, Text.class, Text.class, Text.class, Text.class, new Configuration());
ChainReducer.setReducer(job, Reduce.class, Text.class, Text.class, Text.class, Text.class, new Configuration());
ChainReducer.addMapper(job, MapClass3.class, Text.class, Text.class, Text.class, IntWritable.class, new Configuration());
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws Exception {
int res=ToolRunner.run(new Configuration(), new ChainJob(), args);
System.exit(res);
}
}