The Hadoop version used here follows the new architecture, and its API differs from the old one. The new API lives in org.apache.hadoop.mapreduce, while the old API lives in org.apache.hadoop.mapred. The new API is not compatible with the old API.
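For reference, a few representative classes from each package are listed below side by side. This is only an illustration of where the two APIs live; the two sets of imports are not meant to appear in the same source file:

// New API
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Job;

// Old API (not interchangeable with the classes above)
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.JobConf;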
Main changes:
1. Mapper and Reducer are no longer interfaces but abstract classes, and the map and reduce functions no longer implement the Mapper and Reducer interfaces; instead they extend the Mapper and Reducer abstract classes, which makes them easier to extend.
2. Context objects are used much more widely; a MapContext carries information through the MapReduce job and plays the roles of both OutputCollector and Reporter.
3. Job configuration is handled uniformly by Configuration; there is no need for a separate JobConf class to configure the daemons.
4. Job control is handled by the Job class rather than JobClient, which has been removed from the new API (a sketch of the old-API style follows this list for contrast).
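For contrast, here is a minimal sketch of the same word-count mapper and job submission written against the old org.apache.hadoop.mapred API. The class names OldWordCounter and OldTokenizerMapper are made up for illustration and are not part of the example that follows:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class OldWordCounter {

    // Old API: the mapper implements the Mapper interface (here via MapReduceBase)
    // and writes output through an OutputCollector while reporting progress via
    // Reporter, instead of using the single Context object of the new API.
    public static class OldTokenizerMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(LongWritable key, Text value,
                        OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                output.collect(word, one);
            }
        }
    }

    // Old API: the job is configured with JobConf and submitted through JobClient,
    // instead of Configuration + Job as in the new API.
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(OldWordCounter.class);
        conf.setJobName("word count");
        conf.setMapperClass(OldTokenizerMapper.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(conf, new Path("/input/test"));
        FileOutputFormat.setOutputPath(conf, new Path("/out"));
        JobClient.runJob(conf);
    }
}

The visible differences are exactly the ones listed above: the mapper implements an interface, output goes through OutputCollector and Reporter rather than Context, and JobConf plus JobClient take the place of Configuration plus Job.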
The new-API example below, explained through inline comments, is fairly straightforward.
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCounter {

    // Inner class TokenizerMapper, which extends the Mapper abstract class
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        // map method: read the input, split it into words, and tag each word
        // with a count of 1, emitting pairs of the form <word, 1>
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            System.out.println("-----------------test--------------"); // debug output
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Inner class IntSumReducer, which extends the Reducer abstract class
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable result = new IntWritable();

        // reduce method: values with the same key are gathered into
        // <key, list of values>, i.e. <word, list of 1>; summing the 1s
        // gives the count for each word
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    // main method: the <key, value> pairs produced by the reducer are written
    // to HDFS in TextOutputFormat form
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The following two settings are required; without them LocalFileSystem
        // can end up overriding DistributedFileSystem
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCounter.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        // Output is emitted as <key, value> pairs, hence setOutputKeyClass and setOutputValueClass
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        /*FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));*/
        // The input and output locations can either be passed as command-line arguments
        // (as in the commented-out lines above) or hard-coded as directories below
        FileInputFormat.addInputPath(job, new Path("/input/test"));
        FileOutputFormat.setOutputPath(job, new Path("/out"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
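To try the job out, one typical flow is to package the class into a jar and submit it with the hadoop jar command; the jar name wordcounter.jar below is only an illustrative assumption. Note that the output directory must not already exist, or the job will fail:

hadoop jar wordcounter.jar WordCounter     # reads /input/test, writes results to /out
hdfs dfs -cat /out/part-r-00000            # inspect the resulting word counts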