本文主要参考:http://www.it165.net/admin/html/201410/3821.html
并在其基础上进行了修改。
在开始学习hadoop时,最痛苦的一件事就是难以理解所写程序的执行过程。让我们先来看这个实例:测试类ToolRunnerTest在继承Configured的基础上实现了Tool接口。下面对其用到的基类源码进行分析,就可以理解其执行过程其实非常简单……
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Minimal {@link Tool} implementation that prints the value of the
 * {@code flower} configuration property.
 */
public class ToolRunnerTest extends Configured implements Tool {

    /**
     * Reads the {@code flower} property from the configuration returned by
     * {@code getConf()} and prints it to standard output.
     *
     * @param arg0 arguments handed over by the caller; not used here
     * @return always {@code 0}
     * @throws Exception declared by the {@link Tool} contract
     */
    @Override
    public int run(String[] arg0) throws Exception {
        // Fetch the configuration held by the Configured base class.
        Configuration conf = getConf();
        // Look up the property value and print it.
        System.out.println("flower is" + conf.get("flower"));
        return 0;
    }

    /**
     * Entry point: runs this tool through {@link ToolRunner} and exits the
     * JVM with the status returned by {@link #run(String[])}.
     *
     * @param args command-line arguments forwarded to {@link ToolRunner}
     * @throws Exception propagated from {@link ToolRunner#run}
     */
    public static void main(String[] args) throws Exception {
        // Create the configuration that ToolRunner will hand to the tool.
        Configuration conf = new Configuration();
        // Let ToolRunner drive the custom tool and keep its exit status
        // instead of silently discarding it.
        int exitCode = ToolRunner.run(conf, new ToolRunnerTest(), args);
        System.exit(exitCode);
    }
}
基类Configured实现了Configurable接口,而Configurable接口源码如下
1 Public interface Configurable{
2 Void setConf(Configuration conf);
3 Configuration getConf();
4 }
Configured则必须实现Configurable接口的两个方法,源码如下:
/**
*Configured类包含getConf()方法,经常被使用
*/
Public class Configured implements Configurable{
Private Configuration conf;
Public Configured(Configuration conf){
setConf(conf);
}
Public void setConf(Configuration conf){
This.conf=conf;
}
Public getConf(){
Return conf;
}
}
Tool和ToolRunner的源代码如下:
/**
 * A {@link Configurable} that can additionally be executed with
 * command-line style arguments.
 */
public interface Tool extends Configurable {

    /**
     * Executes the tool.
     *
     * @param args arguments for this run
     * @return the exit status of the run
     * @throws Exception if the run fails
     */
    int run(String[] args) throws Exception;
}
public class ToolRunner {
public static int run(Configuration conf,
Tool tool, String[] args) throws Exception{
if(conf == null) {
conf = new Configuration();
}
GenericOptionsParser parser = new
GenericOptionsParser(conf, args);
//set the configuration back, so that Tool can
//configure itself
tool.setConf(conf)
String[] toolArgs = parser.getRemainingArgs();
return tool.run(toolArgs);
}
//调用上面的三参数run函数
public static int run(Tool tool, String[] args)
throws Exception{
return run(tool.getConf(), tool, args);
}
public static void
printGenericCommandUsage(PrintStream out) {
GenericOptionsParser.printGenericCommandUsage(out);
}
}
解析:当程序执行ToolRunner.run(conf, new ToolRunnerTest(), args);时,会转到ToolRunner类的三参数run方法。因为Configuration已经实例化(conf不为null),所以不会再新建Configuration,而是在解析参数并调用tool.setConf(conf)之后,一直执行到tool.run(toolArgs);。又因为Tool是一个只声明了run方法的接口,所以实际执行的是实现该接口的类ToolRunnerTest的run方法,由它完成输出。其实在看完这几个类的源码后,其执行过程是很简单的。
下面是一个最简单的MapReduce 程序,综合应用辅助函数:
package hadoop.ch047.practice;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
//import org.apache.hadoop.mapred.FileInputFormat;//必须使用下面那个,使用这个会出错
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;//必须带lib的
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
//最简单的MapRedcue程序,没有是使用Mapper和Reducer
public class TestMapReduce extends Configured implements Tool{
//Main 函数中的ToolRunner.run将调用此函数
public int run(String[] args) throws Exception {
if (args.length<2) {
System.out.printf("Usage:%s <input> <output>\n",
this.getClass().getSimpleName());
return -1;
}
Configuration conf=getConf();
Job job=new Job(conf);
//以下三种方法均可以job.setJarByClass
//job.setJarByClass(this.getClass());
job.setJarByClass(getClass());
//job.setJarByClass(TestMapReduce.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.out.print("use the TestMapReduce.run\n");
return job.waitForCompletion(true)?0:1;
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int exitCode=ToolRunner.run(conf, new TestMapReduce(), args);
System.out.println("job is finished!");
System.exit(exitCode);
//查看输出结果
//hadoop fs -text part-00000|head
//文本输出可以用cat
}
}