In a web project, once a user submits a task, it fits the B/S architecture better to have the backend server remotely call the machine hosting the JobTracker and run the Map/Reduce job there.
Since I could not find any material on this online, I implemented it myself and am sharing it here.
Note: based on Hadoop 1.1.2.
If you repost, please credit the source: http://sgq0085.iteye.com/admin/blogs/1879442
A typical WordCount looks like this:
    package com.gqshao.hadoop.remote;

    import java.io.IOException;
    import java.util.*;

    import org.apache.hadoop.conf.*;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.*;
    import org.apache.hadoop.mapreduce.*;
    import org.apache.hadoop.mapreduce.lib.input.*;
    import org.apache.hadoop.mapreduce.lib.output.*;
    import org.apache.hadoop.util.*;

    public class WordCount extends Configured implements Tool {

        public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String line = value.toString();
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public int run(String[] args) throws Exception {
            Configuration conf = getConf();
            Job job = new Job(conf);
            conf.set("mapred.job.tracker", "192.168.0.128:9001");
            conf.set("fs.default.name", "hdfs://192.168.0.128:9000");
            conf.set("hadoop.job.ugi", "hadoop");
            conf.set("hadoop.tmp.dir", "/user/gqshao/temp/"); // was "Hadoop.tmp.dir"; property names are case-sensitive
            job.setJarByClass(WordCount.class);
            job.setJobName("wordcount");
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            String hdfs = "hdfs://192.168.0.128:9000";
            args = new String[] { hdfs + "/user/gqshao/input/big", hdfs + "/user/gqshao/output/WordCount/" + new Date().getTime() };
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            boolean success = job.waitForCompletion(true);
            return success ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            int ret = ToolRunner.run(new WordCount(), args);
            System.exit(ret);
        }
    }
Running it prints:

    INFO: Running job: job_local_0001

which proves the Map/Reduce job ran in local mode. In other words, with this approach you have to build the jar in advance, copy it to a machine in the cluster, and then run it from that jar.
So how can a Map/Reduce job be run remotely? Investigation turned up two requirements.

1. The Hadoop configuration files must be loaded into the current process's ClassLoader, or placed in the project's /bin output directory (i.e. somewhere on the classpath).

Tracing job.waitForCompletion(true); → submit(); → info = jobClient.submitJobInternal(conf); → status = jobSubmitClient.submitJob(jobId, submitJobDir.toString(), jobCopy.getCredentials()); shows that private JobSubmissionProtocol jobSubmitClient; has two implementations. In the init() method of org.apache.hadoop.mapred.JobClient you can see that if mapred.job.tracker is set in the conf, the job runs on the Hadoop cluster; otherwise it runs locally:
    public void init(JobConf conf) throws IOException {
        String tracker = conf.get("mapred.job.tracker", "local");
        tasklogtimeout = conf.getInt(
            TASKLOG_PULL_TIMEOUT_KEY, DEFAULT_TASKLOG_TIMEOUT);
        this.ugi = UserGroupInformation.getCurrentUser();
        if ("local".equals(tracker)) {
            conf.setNumMapTasks(1);
            this.jobSubmitClient = new LocalJobRunner(conf);
        } else {
            this.rpcJobSubmitClient =
                createRPCProxy(JobTracker.getAddress(conf), conf);
            this.jobSubmitClient = createProxy(this.rpcJobSubmitClient, conf);
        }
    }
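In other words, whether the job goes to the cluster is decided entirely by what the Configuration can see at submission time. As a quick sanity check (a minimal sketch, using the standard Hadoop 1.x property name), you can print what the client process will resolve:

    Configuration conf = new Configuration();
    // With no mapred-site.xml visible on the classpath this prints "local",
    // which is exactly why the job above ran as job_local_0001.
    System.out.println(conf.get("mapred.job.tracker", "local"));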
So the configuration files under a given directory need to be loaded at runtime. Here is the method (from the utility class referred to below as RemoteHadoopUtil):
    /**
     * Load configuration files: append the given classpath directory to the
     * thread's context ClassLoader so that Hadoop's Configuration can resolve
     * the *-site.xml files inside it.
     */
    public static void setConf(Class<?> clazz, Thread thread, String path) {
        URL url = clazz.getResource(path);
        try {
            File confDir = new File(url.toURI());
            if (!confDir.exists()) {
                return;
            }
            URL key = confDir.getCanonicalFile().toURI().toURL();
            ClassLoader classLoader = thread.getContextClassLoader();
            classLoader = new URLClassLoader(new URL[] { key }, classLoader);
            thread.setContextClassLoader(classLoader);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
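For example, assuming the cluster's core-site.xml, hdfs-site.xml and mapred-site.xml have been copied into a /hadoop directory on the classpath (say src/main/resources/hadoop in a Maven project; the layout is an assumption, adjust it to your build):

    // Make the /hadoop classpath directory visible to the context ClassLoader.
    RemoteHadoopUtil.setConf(WordCount.class, Thread.currentThread(), "/hadoop");
    // Configuration's constructor picks up the thread context ClassLoader,
    // so instances created on this thread from now on can find those files.
    Configuration conf = new Configuration();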
2. Set the jar to run at submission time.

Reading further into jobClient.submitJobInternal(conf), you can see that when the client submits a job to Hadoop, it has to package the job as a jar and copy it to the submitJarFile path on the filesystem. So the conf must specify the jar to run. Here is the method:
    /**
     * Dynamically build a jar containing the job's classes.
     */
    public static File createJar(Class<?> clazz) throws Exception {
        String fqn = clazz.getName();
        String base = fqn.substring(0, fqn.lastIndexOf("."));
        base = "/" + base.replaceAll("\\.", Matcher.quoteReplacement("/"));
        URL root = clazz.getResource("");
        JarOutputStream out = null;
        final File jar = File.createTempFile("HadoopRunningJar-", ".jar", new File(System.getProperty("java.io.tmpdir")));
        System.out.println(jar.getAbsolutePath());
        // Remove the temporary jar when the JVM exits.
        Runtime.getRuntime().addShutdownHook(new Thread() {
            public void run() {
                jar.delete();
            }
        });
        try {
            File path = new File(root.toURI());
            Manifest manifest = new Manifest();
            manifest.getMainAttributes().putValue("Manifest-Version", "1.0");
            manifest.getMainAttributes().putValue("Created-By", "RemoteHadoopUtil");
            out = new JarOutputStream(new FileOutputStream(jar), manifest);
            writeBaseFile(out, path, base);
        } finally {
            if (out != null) { // guard against failures before the stream was opened
                out.flush();
                out.close();
            }
        }
        return jar;
    }
    /**
     * Recursively add the .class files under a directory to the jar.
     */
    private static void writeBaseFile(JarOutputStream out, File file, String base) throws IOException {
        if (file.isDirectory()) {
            File[] fl = file.listFiles();
            if (base.length() > 0) {
                base = base + "/";
            }
            for (int i = 0; i < fl.length; i++) {
                writeBaseFile(out, fl[i], base + fl[i].getName());
            }
        } else {
            out.putNextEntry(new JarEntry(base));
            FileInputStream in = null;
            try {
                in = new FileInputStream(file);
                byte[] buffer = new byte[1024];
                int n = in.read(buffer);
                while (n != -1) {
                    out.write(buffer, 0, n);
                    n = in.read(buffer);
                }
            } finally {
                if (in != null) {
                    in.close();
                }
            }
        }
    }
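Wiring the two pieces together is then a one-liner on the Hadoop 1 API: JobConf.setJar() stores the path under the mapred.jar property, which is what submitJobInternal ships to the cluster. The modified WordCount below does exactly this:

    // Pack the classes next to WordCount into a temporary jar and register it.
    File jarFile = RemoteHadoopUtil.createJar(WordCount.class);
    ((JobConf) job.getConfiguration()).setJar(jarFile.toString()); // sets "mapred.jar"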
The modified WordCount looks like this:
    public class WordCount extends Configured implements Tool {

        public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String line = value.toString();
                System.out.println("line===>" + line);
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public int run(String[] args) throws Exception {
            Configuration conf = getConf();
            Job job = new Job(conf);
            System.out.println(conf.get("mapred.job.tracker"));
            System.out.println(conf.get("fs.default.name"));
            // Requirement 2: dynamically build the jar and register it with the job.
            File jarFile = RemoteHadoopUtil.createJar(WordCount.class);
            ((JobConf) job.getConfiguration()).setJar(jarFile.toString());
            job.setJarByClass(WordCount.class);
            job.setJobName("wordcount");
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            String hdfs = "hdfs://192.168.0.128:9000";
            args = new String[] { hdfs + "/user/gqshao/input/WordCount/", hdfs + "/user/gqshao/output/WordCount/" + new Date().getTime() };
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            boolean success = job.waitForCompletion(true);
            System.out.println(job.isComplete());
            System.out.println("JobID: " + job.getJobID());
            return success ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            // Requirement 1: load the cluster configuration files into the context ClassLoader.
            RemoteHadoopUtil.setConf(WordCount.class, Thread.currentThread(), "/hadoop");
            int ret = ToolRunner.run(new WordCount(), args);
            System.exit(ret);
        }
    }
For Hadoop 2.2.0, the equivalent looks like this:

    package com.missionsky.hadoop.remote;

    import java.io.File;
    import java.io.IOException;
    import java.util.Date;
    import java.util.StringTokenizer;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.conf.Configured;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.Mapper;
    import org.apache.hadoop.mapreduce.Reducer;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.hadoop.util.Tool;
    import org.apache.hadoop.util.ToolRunner;

    import com.missionsky.hadoop.remote.utils.RemoteHadoopUtil;
    public class WordCount extends Configured implements Tool {

        public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {
            private final static IntWritable one = new IntWritable(1);
            private Text word = new Text();

            public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                String line = value.toString();
                System.out.println("line===>" + line);
                StringTokenizer tokenizer = new StringTokenizer(line);
                while (tokenizer.hasMoreTokens()) {
                    word.set(tokenizer.nextToken());
                    context.write(word, one);
                }
            }
        }

        public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {
            public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
                int sum = 0;
                for (IntWritable val : values) {
                    sum += val.get();
                }
                context.write(key, new IntWritable(sum));
            }
        }

        public int run(String[] args) throws Exception {
            // Build the Job from the Configuration supplied through ToolRunner
            // (the original called the no-argument Job.getInstance(), which
            // silently creates a fresh Configuration instead).
            Job job = Job.getInstance(getConf());
            job.setJobName("job_wordcount");
            // Create the jar dynamically, as in the Hadoop 1 version.
            File jarFile = RemoteHadoopUtil.createJar(WordCount.class);
            job.setJar(jarFile.toString());
            job.setJarByClass(WordCount.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);
            job.setMapperClass(Map.class);
            job.setReducerClass(Reduce.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            String hdfs = "hdfs://192.168.0.109:9000";
            FileInputFormat.setInputPaths(job, new Path(hdfs + "/user/input/wordcount/"));
            FileOutputFormat.setOutputPath(job, new Path(hdfs + "/user/output/wordcount/" + new Date().getTime()));
            boolean success = job.waitForCompletion(true);
            System.out.println("Job Final Status: " + job.getStatus().getState());
            return success ? 0 : 1;
        }

        public static void main(String[] args) throws Exception {
            Configuration configuration = new Configuration();
            int ret = ToolRunner.run(configuration, new WordCount(), args);
            System.exit(ret);
        }
    }
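If the cluster's *-site.xml files are not on the client's classpath, the addresses can also be set programmatically before calling ToolRunner.run. A sketch with assumed endpoints (port 8032 is the usual ResourceManager default; substitute your own cluster's values):

    Configuration configuration = new Configuration();
    // Assumed endpoints for illustration; replace with your NameNode and ResourceManager.
    configuration.set("fs.defaultFS", "hdfs://192.168.0.109:9000");
    configuration.set("mapreduce.framework.name", "yarn");
    configuration.set("yarn.resourcemanager.address", "192.168.0.109:8032");
    int ret = ToolRunner.run(configuration, new WordCount(), args);

Since run() builds the Job with Job.getInstance(getConf()), these settings are picked up by the submitted job.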