Chaining Hadoop Jobs

Sometimes a complex task cannot be completed by a single MapReduce job; it has to be broken into several subtasks, each of which is a separate MapReduce job. In that case the individual MapReduce jobs need to be chained together.

1. Sequential chaining
The iterative style of MapReduce works like a for loop: the output of one MapReduce job becomes the input of the next, and once the whole task finishes the intermediate results can be deleted.

Configuration conf1 = new Configuration();
Job job1 = new Job(conf1,"job1");
.....
FileInputFormat.addInputPath(job1,InputPath1);
FileOutputFormat.setOutputPath(job1,Outpath1);
job1.waitForCompletion(true);
//second MapReduce job, reading job1's output
Configuration conf2 = new Configuration();
Job job2 = new Job(conf2,"job2");
.....
FileInputFormat.addInputPath(job2,Outpath1);
FileOutputFormat.setOutputPath(job2,Outpath2);
job2.waitForCompletion(true);
//third MapReduce job, reading job2's output
Configuration conf3 = new Configuration();
Job job3 = new Job(conf3,"job3");
.....
FileInputFormat.addInputPath(job3,Outpath2);
FileOutputFormat.setOutputPath(job3,Outpath3);
job3.waitForCompletion(true);
.....
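
Because the intermediate directories only serve as a hand-off between consecutive jobs, they can be deleted once the final job has succeeded. A minimal sketch, reusing the conf3, Outpath1 and Outpath2 names from the snippet above (FileSystem is org.apache.hadoop.fs.FileSystem):

//delete the intermediate outputs after the last job finishes successfully
FileSystem fs = FileSystem.get(conf3);
fs.delete(Outpath1, true); //true = delete recursively
fs.delete(Outpath2, true);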
  2. MapReduce jobs with dependencies
    Imagine a task made up of three MapReduce subtasks job1, job2 and job3, where job1 and job2 are independent of each other but job3 may only run after both job1 and job2 have finished. This kind of composition is a MapReduce workflow with data dependencies. Hadoop provides an execution and control mechanism for it through the jobcontrol Job class (ControlledJob in the new API) and the JobControl class: besides holding a subtask's configuration, the controlled job also records its dependencies, while JobControl drives the whole workflow. Add all the subtask jobs to a JobControl instance and run it (typically in its own thread, as in the complete example below) to execute the program.
Configuration job1conf = new Configuration();
Job job1 = new Job(job1conf,"Job1");
.........//other settings for job1
Configuration job2conf = new Configuration();
Job job2 = new Job(job2conf,"Job2");
.........//other settings for job2
Configuration job3conf = new Configuration();
Job job3 = new Job(job3conf,"Job3");
.........//other settings for job3
job3.addDependingJob(job1);//job3 depends on job1
job3.addDependingJob(job2);//job3 depends on job2
JobControl JC = new JobControl("123");
JC.addJob(job1);//add all three jobs to the JobControl
JC.addJob(job2);
JC.addJob(job3);
JC.run();//in practice run JobControl in its own thread and poll allFinished(), as the complete example below does

Complete example:
The input format is X,Y, meaning that Y is cited by X. We want the K most frequently cited patents.
The first MapReduce job counts how many times each patent is cited; the second MapReduce job computes the top K.

package Chapter5;

import java.io.IOException;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class DependingJoin extends Configured implements Tool {

    // Mapper for job1: KeyValueTextInputFormat splits each input line "X,Y" into
    // key = citing patent X and value = cited patent Y; emit (Y, 1) so the reducer
    // can count citations per patent.
    public static class MapClass1 extends Mapper<Text, Text, Text, IntWritable>{

        private static IntWritable one=new IntWritable(1);
        public void map(Text key,Text value,Context context)throws IOException,InterruptedException{

            context.write(value, one);
        }
    }

    // Reducer for job1: sum the 1s for each cited patent -> (patent, citation count).
    public static class Reduce1 extends Reducer<Text, IntWritable, Text,IntWritable>{

        public void reduce(Text key,Iterable<IntWritable> values,Context context)throws IOException,InterruptedException{

            int sum=0;
            for(IntWritable x : values){
                sum+=x.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }
    private static Integer K=10; //size of the top-K list
    // Mapper for job2: reads job1's output (patent \t count) and keeps a per-mapper
    // TreeMap of the K largest counts, emitted in cleanup(). Note that patents with
    // identical counts overwrite each other in the TreeMap.
    public static class MapClass2 extends Mapper<Text, Text, IntWritable, Text>{

        TreeMap<Integer, String> map=new TreeMap<Integer,String>();
        public void map(Text key,Text value,Context context) throws IOException,InterruptedException{
            if(key!=null&&value!=null){
                String patent=key.toString();
                Integer num=Integer.parseInt(value.toString());
                map.put(num, patent);
                if(map.size()>K){
                    map.remove(map.firstKey());
                }
            }
        }
        @Override
        protected void cleanup(Mapper<Text, Text, IntWritable, Text>.Context context)throws IOException,InterruptedException{

            for(Integer num:map.keySet()){
                context.write(new IntWritable(num) ,new Text(map.get(num)));
            }
        }
    }

    // Reducer for job2: merge the per-mapper candidates and again keep only the K
    // largest counts; write them out as (patent, count) in cleanup().
    public static class Reduce2  extends Reducer<IntWritable, Text, Text, IntWritable>{

        TreeMap<Integer, String> map=new TreeMap<Integer,String>();
        public void reduce(IntWritable key,Iterable<Text> value,Context context)throws IOException,InterruptedException{

            for(Text text : value){
                map.put(key.get(), text.toString());
                if(map.size()>K){
                    map.remove(map.firstKey());
                }
            }
        }
        @Override
        protected void cleanup(Reducer<IntWritable, Text, Text, IntWritable>.Context context)throws IOException,InterruptedException{

            for(Integer num:map.keySet()){
                context.write(new Text(map.get(num)),new IntWritable(num));
            }

        }
    }

    @Override
    public int run(String[] arg0) throws Exception {
        //job1: count how many times each patent is cited
        Configuration configuration1=getConf();
        // input lines are "citing,cited", so KeyValueTextInputFormat splits on the comma
        configuration1.set("key.value.separator.in.input.line", ",");
        configuration1.set("mapred.textoutputformat.separator", "\t");

        Job job1=new Job(configuration1, "citenum");
        FileInputFormat.addInputPath(job1, new Path(arg0[0]));
        FileOutputFormat.setOutputPath(job1, new Path(arg0[1]));

        job1.setJarByClass(DependingJoin.class);
        job1.setMapperClass(MapClass1.class);
        job1.setReducerClass(Reduce1.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(IntWritable.class);
        job1.setInputFormatClass(KeyValueTextInputFormat.class);
        job1.setOutputFormatClass(TextOutputFormat.class);
        ControlledJob ctrjob1=new ControlledJob(configuration1);
        ctrjob1.setJob(job1);

        //job2: select the top-K most-cited patents from job1's output
        Configuration configuration2=getConf();
        // job1 wrote "patent\tcount", so job2 splits key/value on the tab
        configuration2.set("key.value.separator.in.input.line", "\t");
        Job job2=new Job(configuration2, "citenum2");
        FileInputFormat.addInputPath(job2, new Path(arg0[1]));
        FileOutputFormat.setOutputPath(job2, new Path(arg0[2]));

        job2.setJarByClass(DependingJoin.class);
        job2.setMapperClass(MapClass2.class);
        job2.setReducerClass(Reduce2.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(IntWritable.class);
        job2.setMapOutputKeyClass(IntWritable.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setInputFormatClass(KeyValueTextInputFormat.class);
        job2.setOutputFormatClass(TextOutputFormat.class);
        ControlledJob ctrjob2=new ControlledJob(configuration2);
        ctrjob2.setJob(job2);

        //depend
        ctrjob2.addDependingJob(ctrjob1);

        JobControl jobControl=new JobControl("myjob");

        jobControl.addJob(ctrjob1);
        jobControl.addJob(ctrjob2);

        Thread thread=new Thread(jobControl);
        thread.start();
        // poll until both jobs have finished; sleep so the loop does not spin
        while(true){
            if(jobControl.allFinished()){
                System.out.println(jobControl.getSuccessfulJobList());
                jobControl.stop();
                break;
            }
            Thread.sleep(500);
        }
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int res=ToolRunner.run(new Configuration(), new DependingJoin(), args);
        System.exit(res);
    }
}
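
How the example might be submitted, as a sketch only (the jar name and HDFS paths below are assumptions, not taken from the original): arg0[0] is the raw citation file, arg0[1] receives job1's per-patent citation counts and is read back by job2, and arg0[2] receives the final top-K list.

hadoop jar chapter5.jar Chapter5.DependingJoin /patent/cite.txt /patent/citecount /patent/topk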
  3. Chaining pre- and post-processing stages
    First, an example of why chained MapReduce is useful. When counting words we may see forms such as make, made and making; they all belong to the same word and should be folded together when the counts are accumulated. This could be handled by a separate MapReduce job, but adding extra MapReduce jobs lengthens the overall processing cycle and adds I/O, so it is not efficient. A better approach is to add an auxiliary map step alongside the core MapReduce and merge the auxiliary map and the core MapReduce into one chained MapReduce job that completes the whole task. Hadoop provides ChainMapper and ChainReducer specifically for such chained tasks: ChainMapper lets one map task contain several Mapper sub-steps, and ChainReducer lets several Mapper sub-steps be added after the Reducer runs.
package Chapter5;

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class ChainJob extends Configured implements Tool {

    // First chained mapper: pass through only records whose key (the citing patent)
    // is purely numeric, dropping malformed lines.
    public static class MapClass1 extends Mapper<Text, Text, Text, Text>{

        public void map(Text key,Text value,Context context)throws IOException,InterruptedException{

            if (key!=null&&Pattern.matches("\\d+", key.toString())) {
                //System.out.println(key+"&&&"+value);
                context.write(key, value);
            }
        }
    }
    // Second chained mapper: swap key and value so the cited patent becomes the key.
    public static class MapClass2 extends Mapper<Text, Text, Text, Text>{
        public void map(Text key,Text value,Context context)throws IOException,InterruptedException{

            context.write(value, key);
        }
    }

    // Reducer: for each cited patent, concatenate all citing patents into a comma-separated list.
    public static class Reduce extends Reducer<Text, Text, Text,Text>{

        public void reduce(Text key,Iterable<Text> values,Context context)throws IOException,InterruptedException{

            String res="";
            for(Text x : values){
                if(res.length()>0)res+=",";
                res+=x.toString();
            }
            context.write(key, new Text(res));
        }
    }
    // Post-reduce chained mapper: replace the comma-separated list of citing patents
    // with its length, i.e. the citation count of the cited patent.
    public static class MapClass3 extends Mapper<Text, Text, Text, IntWritable>{
        public void map(Text key,Text value,Context context)throws IOException,InterruptedException{

            String tokens []=value.toString().split(",");
            int sum=tokens.length;
            context.write(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] arg0) throws Exception {

        Configuration configuration=getConf();
        // input lines are "citing,cited", so KeyValueTextInputFormat splits on the comma
        configuration.set("key.value.separator.in.input.line", ",");
        Job job=new Job(configuration,"chainjob");
        job.setJarByClass(ChainJob.class);
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(arg0[0]));
        FileOutputFormat.setOutputPath(job, new Path(arg0[1]));

        // chain: MapClass1 -> MapClass2 -> Reduce -> MapClass3, all inside a single MapReduce job
        ChainMapper.addMapper(job, MapClass1.class, Text.class, Text.class, Text.class, Text.class, new Configuration());
        ChainMapper.addMapper(job, MapClass2.class, Text.class, Text.class, Text.class, Text.class, new Configuration());
        ChainReducer.setReducer(job, Reduce.class, Text.class, Text.class, Text.class, Text.class, new Configuration());
        ChainReducer.addMapper(job, MapClass3.class, Text.class, Text.class, Text.class, IntWritable.class, new Configuration());
        return job.waitForCompletion(true)?0:1;
    }

    public static void main(String[] args) throws Exception {
        int res=ToolRunner.run(new Configuration(), new ChainJob(), args);
        System.exit(res);
    }
}