JobControl Design and Usage
1. How JobControl works:
JobControl is built around two classes: Job and JobControl.
The Job class encapsulates a MapReduce job together with its dependencies; it monitors the run state of each job it depends on and updates its own state accordingly.
A job starts in the WAITING state. Once it has no dependencies, or all of its dependencies have finished running, it moves to READY. A job in the READY state can be submitted to the Hadoop cluster, at which point it enters RUNNING. From RUNNING it ends up in either SUCCESS or FAILED, depending on the outcome of the run.
Note that if any job a given job depends on fails, that job fails as well, producing a "domino effect": all downstream jobs fail too.
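In the new API these states are exposed through ControlledJob's getJobState() method. Below is a minimal monitoring sketch built on that method (the class name StatePollSketch and the method report are illustrative, not part of the original example); it assumes the controlled jobs have already been added to a JobControl that is running on its own thread:
- import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
- import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
- public class StatePollSketch {
-     // Print each controlled job's current state until the controller reports that all
-     // jobs have finished. The states mirror the lifecycle described above:
-     // WAITING -> READY -> RUNNING -> SUCCESS/FAILED, plus DEPENDENT_FAILED for jobs
-     // whose upstream dependency failed.
-     static void report(JobControl jc, ControlledJob... jobs) throws InterruptedException {
-         while (!jc.allFinished()) {
-             for (ControlledJob j : jobs) {
-                 System.out.println(j.getJobName() + " -> " + j.getJobState());
-             }
-             Thread.sleep(1000); // poll once per second instead of busy-waiting
-         }
-     }
- }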
Old API: org.apache.hadoop.mapred.jobcontrol (Job, JobControl)
New API: org.apache.hadoop.mapreduce.lib.jobcontrol (ControlledJob, JobControl)
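For comparison, here is a rough sketch of the same dependency wiring with the old API (org.apache.hadoop.mapred.jobcontrol). Exact constructor signatures differ between Hadoop releases, so treat this as an outline under those assumptions rather than part of the original example; the full new-API implementation follows in the next section.
- import org.apache.hadoop.mapred.JobConf;
- import org.apache.hadoop.mapred.jobcontrol.Job;
- import org.apache.hadoop.mapred.jobcontrol.JobControl;
- public class OldApiSketch {
-     static void run(JobConf conf1, JobConf conf2, JobConf conf3) throws Exception {
-         // In the old API the controlled-job wrapper is simply called Job and wraps a JobConf.
-         Job j1 = new Job(conf1);
-         Job j2 = new Job(conf2);
-         Job j3 = new Job(conf3);
-         j3.addDependingJob(j1); // j3 runs only after j1 and j2 succeed
-         j3.addDependingJob(j2);
-         JobControl jc = new JobControl("old api group");
-         jc.addJob(j1);
-         jc.addJob(j2);
-         jc.addJob(j3);
-         new Thread(jc).start(); // the old JobControl is also a Runnable
-         while (!jc.allFinished()) {
-             Thread.sleep(1000);
-         }
-         jc.stop();
-     }
- }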
2. JobControl code example:
- import java.io.File;
- import java.io.IOException;
- import java.util.HashSet;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.fs.Path;
- import org.apache.hadoop.io.LongWritable;
- import org.apache.hadoop.io.Text;
- import org.apache.hadoop.mapreduce.Job;
- import org.apache.hadoop.mapreduce.Mapper;
- import org.apache.hadoop.mapreduce.Reducer;
- import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
- import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
- import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
- import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
- import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
- import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
- import org.apache.hadoop.util.GenericOptionsParser;
- public class JobControlDemo {
- public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
- Configuration conf = new Configuration();
- String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
- if (otherargs.length != 3) {
- System.err.println("Usage: JobControlDemo <InputPath1> <InputPath2> <OutPath>");
- System.exit(2);
- }
- // Create the underlying jobs
- Job job1 = Job.getInstance(conf, JobControlDemo.class.getSimpleName() + "1");
- Job job2 = Job.getInstance(conf, JobControlDemo.class.getSimpleName() + "2");
- Job job3 = Job.getInstance(conf, JobControlDemo.class.getSimpleName() + "3");
- // Configure Job1
- job1.setJarByClass(JobControlDemo.class);
- job1.setMapOutputKeyClass(Text.class);
- job1.setMapOutputValueClass(Text.class);
- job1.setOutputKeyClass(Text.class);
- job1.setOutputValueClass(Text.class);
- job1.setMapperClass(MyMapper1.class);
- job1.setReducerClass(MyReducer1.class);
- job1.setInputFormatClass(TextInputFormat.class);
- job1.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.addInputPath(job1, new Path(otherargs[0]));
- FileOutputFormat.setOutputPath(job1, new Path(otherargs[2]+File.separator+"mid1"));
- // Configure Job2
- job2.setJarByClass(JobControlDemo.class);
- job2.setMapOutputKeyClass(Text.class);
- job2.setMapOutputValueClass(Text.class);
- job2.setOutputKeyClass(Text.class);
- job2.setOutputValueClass(Text.class);
- job2.setMapperClass(MyMapper2.class);
- job2.setReducerClass(MyReducer2.class);
- job2.setInputFormatClass(TextInputFormat.class);
- job2.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.addInputPath(job2, new Path(otherargs[1]));
- FileOutputFormat.setOutputPath(job2, new Path(otherargs[2]+File.separator+"mid2"));
- // Configure Job3
- job3.setJarByClass(JobControlDemo.class);
- job3.setMapOutputKeyClass(Text.class);
- job3.setMapOutputValueClass(Text.class);
- job3.setOutputKeyClass(Text.class);
- job3.setOutputValueClass(Text.class);
- job3.setMapperClass(MyMapper3.class);
- job3.setReducerClass(MyReducer3.class);
- job3.setInputFormatClass(KeyValueTextInputFormat.class);
- job3.setOutputFormatClass(TextOutputFormat.class);
- FileInputFormat.addInputPath(job3, new Path(otherargs[2]+File.separator+"mid1"));
- FileInputFormat.addInputPath(job3, new Path(otherargs[2]+File.separator+"mid2"));
- FileOutputFormat.setOutputPath(job3, new Path(otherargs[2]+File.separator+"result"));
- // Create the controlled jobs
- ControlledJob cjob1 = new ControlledJob(conf);
- ControlledJob cjob2 = new ControlledJob(conf);
- ControlledJob cjob3 = new ControlledJob(conf);
- // Wrap the plain jobs as controlled jobs
- cjob1.setJob(job1);
- cjob2.setJob(job2);
- cjob3.setJob(job3);
- // Set up the dependencies: job3 depends on job1 and job2
- //cjob2.addDependingJob(cjob1);
- cjob3.addDependingJob(cjob1);
- cjob3.addDependingJob(cjob2);
- // Create the job controller
- JobControl jc = new JobControl("My control job");
- // Add the controlled jobs to the controller
- jc.addJob(cjob1);
- jc.addJob(cjob2);
- jc.addJob(cjob3);
- /**
- * Hadoop's JobControl implements the Runnable interface, so it must be started on its own thread.
- * Calling JobControl's run() directly would never return, because its control loop does not exit on its own.
- */
- //jc.run();
- Thread jcThread = new Thread(jc);
- jcThread.start();
- while(true){
- if(jc.allFinished()){
- System.out.println(jc.getSuccessfulJobList());
- jc.stop();
- System.exit(0);
- }
- if(jc.getFailedJobList().size() > 0){
- System.out.println(jc.getFailedJobList());
- jc.stop();
- System.exit(1);
- }
- // Sleep briefly so the driver does not busy-wait while the controller thread works
- Thread.sleep(500);
- }
- }
- /**
- * Mapper and Reducer for the first job
- */
- public static class MyMapper1 extends Mapper<LongWritable, Text, Text, Text>{
- @Override
- protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
- throws IOException, InterruptedException {
- String[] spl1=value.toString().split("\t");
- if(spl1.length==2){
- context.write(new Text(spl1[0].trim()), new Text(spl1[1].trim()));
- }
- }
- }
- public static class MyReducer1 extends Reducer<Text, Text, Text, Text>{
- @Override
- protected void reduce(Text k2, Iterable<Text> v2s, Reducer<Text, Text, Text, Text>.Context context)
- throws IOException, InterruptedException {
- for (Text v2 : v2s) {
- context.write(k2, v2);
- }
- }
- }
- /**
- * Mapper and Reducer for the second job
- */
- public static class MyMapper2 extends Mapper<LongWritable, Text, Text, Text>{
- @Override
- protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context)
- throws IOException, InterruptedException {
- String[] spl2=value.toString().split("\t");
- if(spl2.length==2){
- context.write(new Text(spl2[0].trim()), new Text(spl2[1].trim()));
- }
- }
- }
- public static class MyReducer2 extends Reducer<Text, Text, Text, Text>{
- @Override
- protected void reduce(Text k3, Iterable<Text> v3s, Reducer<Text, Text, Text, Text>.Context context)
- throws IOException, InterruptedException {
- for (Text v3 : v3s) {
- context.write(k3, v3);
- }
- }
- }
- /**
- * Mapper and Reducer for the third job
- */
- public static class MyMapper3 extends Mapper<Text, Text, Text, Text>{
- @Override
- protected void map(Text key, Text value, Mapper<Text, Text, Text, Text>.Context context)
- throws IOException, InterruptedException {
- context.write(key, value);
- }
- }
- public static class MyReducer3 extends Reducer<Text,Text, Text, Text>{
- @Override
- protected void reduce(Text k4, Iterable<Text> v4s,Reducer<Text, Text, Text, Text>.Context context)
- throws IOException, InterruptedException {
- HashSet<String> hashSet=new HashSet<String>();
- for (Text v4 : v4s) {
- hashSet.add(v4.toString().trim());
- }
- if(hashSet.size()>=2){ // the key carried two distinct values, i.e. it appeared in both input files
- context.write(k4, new Text("OK"));
- }
- }
- }
- }
Test input data:
- hdfs dfs -text /user/jiuqian/libin/input/inputpath1.txt
- hadoop a
- spark a
- hive a
- hbase a
- tachyon a
- storm a
- redis a
- hdfs dfs -text /user/jiuqian/libin/input/inputpath2.txt
- hadoop b
- spark b
- kafka b
- tachyon b
- oozie b
- flume b
- sqoop b
- solr b
Test output data (only keys present in both input files are written, tagged OK):
- hdfs dfs -text /user/jiuqian/libin/input/inputpathmerge2.txt/result/*
- hadoop OK
- spark OK
- tachyon OK
Run output:
- [sshexec] cmd : bash -c 'source /home/jiuqian/.bashrc; /home/hduser/hadoop/bin/hadoop jar /home/jiuqian/blb/JobControlDemo.jar -D mapreduce.map.java.opts=-Xmx2048m -D mapreduce.input.fileinputformat.split.minsize=1 -Dmapreduce.input.fileinputformat.split.maxsize=512000000 -D mapred.linerecordreader.maxlength=32768 /user/jiuqian/libin/input/inputpath1.txt /user/jiuqian/libin/input/inputpath2.txt /user/jiuqian/libin/input/inputpathmerge2.txt'
- 16/02/27 12:37:45 INFO client.RMProxy: Connecting to ResourceManager at sh-rslog1/192.168.1.2:8032
- 16/02/27 12:37:46 INFO input.FileInputFormat: Total input paths to process : 1
- 16/02/27 12:37:46 INFO mapreduce.JobSubmitter: number of splits:1
- 16/02/27 12:37:46 INFO Configuration.deprecation: mapred.linerecordreader.maxlength is deprecated. Instead, use mapreduce.input.linerecordreader.line.maxlength
- 16/02/27 12:37:47 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1446086163035_17037
- 16/02/27 12:37:47 INFO impl.YarnClientImpl: Submitted application application_1446086163035_17037
- 16/02/27 12:37:47 INFO mapreduce.Job: The url to track the job: http://sh-rslog1:8088/proxy/application_1446086163035_17037/
- 16/02/27 12:37:47 INFO client.RMProxy: Connecting to ResourceManager at sh-rslog1/27.115.29.102:8032
- 16/02/27 12:37:47 INFO input.FileInputFormat: Total input paths to process : 1
- 16/02/27 12:37:47 INFO mapreduce.JobSubmitter: number of splits:1
- 16/02/27 12:37:47 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1446086163035_17038
- 16/02/27 12:37:47 INFO impl.YarnClientImpl: Submitted application application_1446086163035_17038
- 16/02/27 12:37:47 INFO mapreduce.Job: The url to track the job: http://sh-rslog1:8088/proxy/application_1446086163035_17038/
- 16/02/27 12:38:13 INFO client.RMProxy: Connecting to ResourceManager at sh-rslog1/27.115.29.102:8032
- 16/02/27 12:38:13 INFO input.FileInputFormat: Total input paths to process : 2
- 16/02/27 12:38:13 INFO mapreduce.JobSubmitter: number of splits:2
- 16/02/27 12:38:13 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1446086163035_17039
- 16/02/27 12:38:13 INFO impl.YarnClientImpl: Submitted application application_1446086163035_17039
- 16/02/27 12:38:13 INFO mapreduce.Job: The url to track the job: http://sh-rslog1:8088/proxy/application_1446086163035_17039/
- [job name: JobControlDemo1
- job id: My control job0
- job state: SUCCESS
- job mapred id: job_1446086163035_17037
- job message: just initialized
- job has no depending job:
- , job name: JobControlDemo2
- job id: My control job1
- job state: SUCCESS
- job mapred id: job_1446086163035_17038
- job message: just initialized
- job has no depending job:
- , job name: JobControlDemo3
- job id: My control job2
- job state: SUCCESS
- job mapred id: job_1446086163035_17039
- job message: just initialized
- job has 2 dependeng jobs:
- depending job 0: JobControlDemo1
- depending job 1: JobControlDemo2
- ]
- [INFO] Executed tasks
- [INFO] ------------------------------------------------------------------------
- [INFO] BUILD SUCCESS
- [INFO] ------------------------------------------------------------------------
How the three jobs run:
At the start, the first two jobs are scheduled concurrently:
After the first two jobs complete, the third job starts running: