做项目的时候碰到一个需要用链式mapreduce的问题,然后调研了一下ChainMapper和ChainReducer的使用,顺带缕一下新旧API的异同。
首先需要说明的是:从0.20.0开始,hadoop的API发生了改变,但是旧API依然保留,两套API的包名如下:
旧版API包名:org.apache.hadoop.mapred
新版API包名:org.apache.hadoop.mapreduce
所有hadoop已发布版本的具体文档可以在Hadoop官方网站的文档页面(hadoop.apache.org/docs)查看。
2.新旧版API的异同
以map/reduce定义为例
2.1 map
类名定义
//新版
public static class MyMapper extends Mapper<Object, Text, Text, Text>{}
//旧版
public static class MyMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text>{}
涉及包名
//新版
import org.apache.hadoop.mapreduce.Mapper
//旧版
import org.apache.hadoop.mapred.MapReduceBase
import org.apache.hadoop.mapred.Mapper
方法定义
新版
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {}
旧版
public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter)throws IOException{}
涉及包名
//新版
import java.io.IOException
//旧版
import java.io.IOException
import org.apache.hadoop.mapred.OutputCollector
import org.apache.hadoop.mapred.Reporter
2.2 reduce
类名定义
//新版
public static MyReducer extends Reducer<Text,Text,Text,Text> {}
//旧版
public static class MyReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text>{}
涉及包名
//新版
import org.apache.hadoop.mapreduce.Reducer
//旧版
import org.apache.hadoop.mapred.Reducer
方法定义
//新版
public void reduce(Text key, Iterable<Text> values,Context context) throws IOException, InterruptedException {}
//旧版
public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{}
涉及包名
//新版
与map相同
//旧版
比map多一个包
import java.util.Iterator;
2.3 main方法
新版
Configuration conf = new Configuration();
Job job = new Job(conf, "mapreduce"); job.setJarByClass(MyMapReduce.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(path1)); FileOutputFormat.setOutputPath(job, new Path(path2)); job.waitForCompletion(true) ;
旧版
JobConf conf = new JobConf(MyMapReduce.class);
conf.setJobName("mapreduce");
conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class);
conf.setMapperClass(MyMapper.class);conf.setCombinerClass(MyReducer.class); conf.setReducerClass(MyReducer.class);
conf.setInputFormat(TextInputFormat.class);conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(path1));FileOutputFormat.setOutputPath(conf, new Path(path2));
JobClient.runJob(conf);
即新版API中用Job代替了旧版中的JobConf,同时对一些方法也进行了重新定义,具体看下面的包对比
涉及包名
新版
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
旧版
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat;
3.链式MapReduce
ChainMapper和ChainReducer在旧版包中,链式mapreduce只允许有一个reduce,但是可以有多个map,包括reduce之前和之后,我在实现时链如下:map->reduce->map
package org.apache.hadoop.examples; import java.io.IOException; import java.util.StringTokenizer; import java.util.Iterator; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.lib.ChainMapper; import org.apache.hadoop.mapred.lib.ChainReducer; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.mapred.TextInputFormat; import org.apache.hadoop.mapred.TextOutputFormat; public class GraphPartition{ public static class LabelCompareMapper extends MapReduceBase implements Mapper<Object, Text, Text, Text>{ public void map(Object key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{ String line = value.toString(); if(line.substring(0,1).matches("[0-9]{1}")) { String[] values = line.split("\t"); String[] heads = values[0].split(" "); if(values[1].contains("_")) { values[1] = values[1].replace("_",""); String[] tails = values[1].split(" "); String src_ver = heads[0]; String label = heads[1]; for(int i=0;i<tails.length;i++) { output.collect(new Text(tails[i]),new Text(src_ver+"_"+label)); //<2 1_1> } } output.collect(new Text(heads[0]),new Text(heads[1])); //<2 2> } } } public static class LabelCompareReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text>{ private Text result = new Text(); public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{ String head = ""; String tail = ""; while(values.hasNext()) { String val = values.next().toString(); 
if(val.contains("_")) { tail = tail+val+" "; } else { head = val; } } if(tail.contains("_")) { result.set(head+" "+tail); output.collect(key,result); //<2 2 1_1> } } } public static class GraphPartitionMapper extends MapReduceBase implements Mapper<Text, Text,Text,Text>{ public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException{ String ver = key.toString(); String tail = value.toString(); String[] sps = tail.split(" "); String ver_lab = sps[0]; for(int i=0;i<sps.length;i++) { if(sps[i].length()>=1) { if(sps[i].contains("_")) { String[] blocks = sps[i].split("_"); if(blocks[1]==ver_lab) { output.collect(new Text(blocks[0]+" "+ver+" "+blocks[1]),new Text()); } else { output.collect(new Text(blocks[0]+" "+ver+" "+ver_lab),new Text()); } } else { ver_lab = sps[i]; } } } } } public static void main(String[] args) throws Exception{ String path1="lbp/input"; String path2="lbp/out1"; JobConf job = new JobConf(GraphPartition.class); job.setJobName("ChainJob"); job.setInputFormat(TextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); JobConf labelcomparemapperconf = new JobConf(false); ChainMapper.addMapper(job,LabelCompareMapper.class,Object.class,Text.class,Text.class,Text.class, true,labelcomparemapperconf); JobConf labelcomparereducerconf = new JobConf(false); ChainReducer.setReducer(job,LabelCompareReducer.class,Text.class,Text.class, Text.class,Text.class,true,labelcomparereducerconf); JobConf graphpartitionmapperconf = new JobConf(false); ChainReducer.addMapper(job,GraphPartitionMapper.class,Text.class,Text.class,Text.class,Text.class,true,graphpartitionmapperconf); job.setJarByClass(GraphPartition.class); job.setNumReduceTasks(1); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.setInputPaths(job, new Path(path1)); FileOutputFormat.setOutputPath(job, new Path(path2)); JobClient.runJob(job); } }