In a web project, once a user submits a task, it fits the B/S architecture better to have the backend server remotely call the machine hosting the JobTracker and run the Map/Reduce job there.
Since I could not find any material on this online, I implemented it myself and am sharing it here.
Note: based on Hadoop 1.1.2.
If you repost, please credit the source: http://sgq0085.iteye.com/admin/blogs/1879442
A typical WordCount looks like this:
    package com.gqshao.hadoop.remote;

    import java.io.IOException;
    import java.util.*;

    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.mapreduce.lib.input.*;
    import org.apache.hadoop.mapreduce.lib.output.*;
    import org.apache.hadoop.util.*;

    public class WordCount extends Configured implements Tool {

        public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String line = value.toString();
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public int run(String[] args) throws Exception {
            Configuration conf = getConf();
            Job job = new Job(conf);
            conf.set("mapred.job.tracker", "192.168.0.128:9001");
            conf.set("fs.default.name", "hdfs://192.168.0.128:9000");
            conf.set("hadoop.job.ugi", "hadoop");
            conf.set("hadoop.tmp.dir", "/user/gqshao/temp/"); // was "Hadoop.tmp.dir"; property names are case-sensitive
            job.setJarByClass(WordCount.class);
            job.setJobName("wordcount");
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            String hdfs = "hdfs://192.168.0.128:9000";
            args = new String[] { hdfs + "/user/gqshao/input/big", hdfs + "/user/gqshao/output/WordCount/" + new Date().getTime() };
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            boolean success = job.waitForCompletion(true);
            return success ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            int ret = ToolRunner.run(new WordCount(), args);
            System.exit(ret);
        }
    }
Running it prints:

    INFO: Running job: job_local_0001

which proves the Map/Reduce job ran in local mode. In other words, with this approach you have to build the jar in advance, copy it to a machine in the cluster, and then run it from that jar.
So how can a Map/Reduce job be run remotely? Investigation turned up two requirements.

1. The Hadoop configuration files must be loaded into the current process's ClassLoader, or placed in the project's /bin output directory (i.e. somewhere on the classpath).

Tracing job.waitForCompletion(true); → submit(); → info = jobClient.submitJobInternal(conf); → status = jobSubmitClient.submitJob(jobId, submitJobDir.toString(), jobCopy.getCredentials()); shows that private JobSubmissionProtocol jobSubmitClient; has two implementations. In the init() method of org.apache.hadoop.mapred.JobClient you can see that if mapred.job.tracker is set in the conf, the job runs on the Hadoop cluster; otherwise it runs locally:
    public void init(JobConf conf) throws IOException {
        String tracker = conf.get("mapred.job.tracker", "local");
        tasklogtimeout = conf.getInt(
            TASKLOG_PULL_TIMEOUT_KEY, DEFAULT_TASKLOG_TIMEOUT);
        this.ugi = UserGroupInformation.getCurrentUser();
        if ("local".equals(tracker)) {
            conf.setNumMapTasks(1);
            this.jobSubmitClient = new LocalJobRunner(conf);
        } else {
            this.rpcJobSubmitClient =
                createRPCProxy(JobTracker.getAddress(conf), conf);
            this.jobSubmitClient = createProxy(this.rpcJobSubmitClient, conf);
        }
    }
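In other words, whether the job goes to the cluster is decided entirely by what the Configuration can see at submission time. As a quick sanity check (a minimal sketch, using the standard Hadoop 1.x property name), you can print what the client process will resolve:

    Configuration conf = new Configuration();
    // With no mapred-site.xml visible on the classpath this prints "local",
    // which is exactly why the job above ran as job_local_0001.
    System.out.println(conf.get("mapred.job.tracker", "local"));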
So the configuration files under a given directory need to be loaded at runtime. Here is the method (from the utility class referred to below as RemoteHadoopUtil):
    /**
     * Load configuration files: append the given classpath directory to the
     * thread's context ClassLoader so that Hadoop's Configuration can resolve
     * the *-site.xml files inside it.
     */
    public static void setConf(Class<?> clazz, Thread thread, String path) {
        URL url = clazz.getResource(path);
        try {
            File confDir = new File(url.toURI());
            if (!confDir.exists()) {
                return;
            }
            URL key = confDir.getCanonicalFile().toURI().toURL();
            ClassLoader classLoader = thread.getContextClassLoader();
            classLoader = new URLClassLoader(new URL[] { key }, classLoader);
            thread.setContextClassLoader(classLoader);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
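For example, assuming the cluster's core-site.xml, hdfs-site.xml and mapred-site.xml have been copied into a /hadoop directory on the classpath (say src/main/resources/hadoop in a Maven project; the layout is an assumption, adjust it to your build):

    // Make the /hadoop classpath directory visible to the context ClassLoader.
    RemoteHadoopUtil.setConf(WordCount.class, Thread.currentThread(), "/hadoop");
    // Configuration's constructor picks up the thread context ClassLoader,
    // so instances created on this thread from now on can find those files.
    Configuration conf = new Configuration();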
2. Set the jar to run at submission time.

Reading further into jobClient.submitJobInternal(conf), you can see that when the client submits a job to Hadoop, it has to package the job as a jar and copy it to the submitJarFile path on the filesystem. So the conf must specify the jar to run. Here is the method:
    /**
     * Dynamically build a jar containing the job's classes.
     */
    public static File createJar(Class<?> clazz) throws Exception {
        String fqn = clazz.getName();
        String base = fqn.substring(0, fqn.lastIndexOf("."));
        base = "/" + base.replaceAll("\\.", Matcher.quoteReplacement("/"));
        URL root = clazz.getResource("");
        JarOutputStream out = null;
        final File jar = File.createTempFile("HadoopRunningJar-", ".jar", new File(System.getProperty("java.io.tmpdir")));
        System.out.println(jar.getAbsolutePath());
        // Remove the temporary jar when the JVM exits.
        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {
                jar.delete();
            }
        });
        try {
            File path = new File(root.toURI());
            Manifest manifest = new Manifest();
            manifest.getMainAttributes().putValue("Manifest-Version", "1.0");
            manifest.getMainAttributes().putValue("Created-By", "RemoteHadoopUtil");
            out = new JarOutputStream(new FileOutputStream(jar), manifest);
            writeBaseFile(out, path, base);
        } finally {
            if (out != null) { // guard against failures before the stream was opened
                out.flush();
                out.close();
            }
        }
        return jar;
    }
    /**
     * Recursively add the .class files under a directory to the jar.
     */
    private static void writeBaseFile(JarOutputStream out, File file, String base) throws IOException {
        if (file.isDirectory()) {
            File[] fl = file.listFiles();
            if (base.length() > 0) {
                base = base + "/";
            }
            for (int i = 0; i < fl.length; i++) {
                writeBaseFile(out, fl[i], base + fl[i].getName());
            }
        } else {
            out.putNextEntry(new JarEntry(base));
            FileInputStream in = null;
            try {
                in = new FileInputStream(file);
                byte[] buffer = new byte[1024];
                int n = in.read(buffer);
                while (n != -1) {
                    out.write(buffer, 0, n);
                    n = in.read(buffer);
                }
            } finally {
                if (in != null) {
                    in.close();
                }
            }
        }
    }
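Wiring the two pieces together is then a one-liner on the Hadoop 1 API: JobConf.setJar() stores the path under the mapred.jar property, which is what submitJobInternal ships to the cluster. The modified WordCount below does exactly this:

    // Pack the classes next to WordCount into a temporary jar and register it.
    File jarFile = RemoteHadoopUtil.createJar(WordCount.class);
    ((JobConf) job.getConfiguration()).setJar(jarFile.toString()); // sets "mapred.jar"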
The modified WordCount looks like this:
    public class WordCount extends Configured implements Tool {

        public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String line = value.toString();
                System.out.println("line===>" + line);
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public int run(String[] args) throws Exception {
            Configuration conf = getConf();
            Job job = new Job(conf);
            System.out.println(conf.get("mapred.job.tracker"));
            System.out.println(conf.get("fs.default.name"));
            // Requirement 2: dynamically build the jar and register it with the job.
            File jarFile = RemoteHadoopUtil.createJar(WordCount.class);
            ((JobConf) job.getConfiguration()).setJar(jarFile.toString());
            job.setJarByClass(WordCount.class);
            job.setJobName("wordcount");
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            String hdfs = "hdfs://192.168.0.128:9000";
            args = new String[] { hdfs + "/user/gqshao/input/WordCount/", hdfs + "/user/gqshao/output/WordCount/" + new Date().getTime() };
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            boolean success = job.waitForCompletion(true);
            System.out.println(job.isComplete());
            System.out.println("JobID: " + job.getJobID());
            return success ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            // Requirement 1: load the cluster configuration files into the context ClassLoader.
            RemoteHadoopUtil.setConf(WordCount.class, Thread.currentThread(), "/hadoop");
            int ret = ToolRunner.run(new WordCount(), args);
            System.exit(ret);
        }
    }
For Hadoop 2.2.0, the equivalent looks like this:

    package com.missionsky.hadoop.remote;

    import java.io.File;
    import java.io.IOException;
    import java.util.Date;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    import com.missionsky.hadoop.remote.utils.RemoteHadoopUtil;
    public class WordCount extends Configured implements Tool {

        public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String line = value.toString();
                System.out.println("line===>" + line);
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public int run(String[] args) throws Exception {
            // Build the Job from the Configuration supplied through ToolRunner
            // (the original called the no-argument Job.getInstance(), which
            // silently creates a fresh Configuration instead).
            Job job = Job.getInstance(getConf());
            job.setJobName("job_wordcount");
            // Create the jar dynamically, as in the Hadoop 1 version.
            File jarFile = RemoteHadoopUtil.createJar(WordCount.class);
            job.setJar(jarFile.toString());
            job.setJarByClass(WordCount.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            String hdfs = "hdfs://192.168.0.109:9000";
            FileInputFormat.setInputPaths(job, new Path(hdfs + "/user/input/wordcount/"));
            FileOutputFormat.setOutputPath(job, new Path(hdfs + "/user/output/wordcount/" + new Date().getTime()));
            boolean success = job.waitForCompletion(true);
            System.out.println("Job Final Status: " + job.getStatus().getState());
            return success ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            Configuration configuration = new Configuration();
            int ret = ToolRunner.run(configuration, new WordCount(), args);
            System.exit(ret);
        }
    }
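If the cluster's *-site.xml files are not on the client's classpath, the addresses can also be set programmatically before calling ToolRunner.run. A sketch with assumed endpoints (port 8032 is the usual ResourceManager default; substitute your own cluster's values):

    Configuration configuration = new Configuration();
    // Assumed endpoints for illustration; replace with your NameNode and ResourceManager.
    configuration.set("fs.defaultFS", "hdfs://192.168.0.109:9000");
    configuration.set("mapreduce.framework.name", "yarn");
    configuration.set("yarn.resourcemanager.address", "192.168.0.109:8032");
    int ret = ToolRunner.run(configuration, new WordCount(), args);

Since run() builds the Job with Job.getInstance(getConf()), these settings are picked up by the submitted job.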