The Hadoop API provides ChainMapper and ChainReducer for chaining data-processing steps. With ChainMapper, a single map task can run several mapper stages in sequence, much like a Unix pipeline: the output of each mapper becomes the input of the next, until the last mapper's output is fed to the partitioner and on to the reduce side. ChainReducer, on the other hand, should not be read literally as several reducers chained together; it attaches mapper stages after the reducer, so that additional processing can run on the reduce output.
Advantage of ChainReducer: it makes better use of the reduce node and makes it easy to add further processing steps after the reduce function has finished.
Advantage of ChainMapper: the API documentation describes it as reducing I/O, since the chained mappers run inside one task and pass records in memory, rather than as separate MapReduce jobs writing intermediate data to disk. Still, couldn't the logic of several chained mappers be implemented inside a single mapper anyway? I'm not entirely sure.
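For comparison, the two map-side stages of the example below could indeed be fused into a single mapper by hand. A minimal sketch (FusedMapper is a hypothetical class combining Mapper01's random keying with Mapper02's filter; the types match the example that follows):

static class FusedMapper extends Mapper<LongWritable, Text, Text, Text> {
    private final Random rand = new Random();
    private final Text outKey = new Text();

    @Override
    public void map(LongWritable inKey, Text inValue, Context context)
            throws IOException, InterruptedException {
        int r = rand.nextInt(10000);    // Mapper01's step: assign a random key
        if (r <= 2000) {                // Mapper02's step: drop keys <= 2000
            return;
        }
        outKey.set(String.valueOf(r));
        context.write(outKey, inValue);
    }
}

So the chain buys composition rather than raw capability: each stage stays a small, separately testable and reusable class, while the chained mappers still run in one task with no extra disk I/O between them.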
Here is a simple example using ChainMapper and ChainReducer:
package test.com.cn;
import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobPriority;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ChainMapperJob {
    // Mapper01: first stage of the map-side chain; replaces each record's
    // byte-offset key with a random number in [0, 10000), keeping the
    // original line as the value.
    static class Mapper01 extends Mapper<LongWritable, Text, Text, Text> {
        private final Random randGenerator = new Random();
        private final Text outKey = new Text();

        @Override
        public void map(LongWritable inKey, Text inValue, Context context)
                throws IOException, InterruptedException {
            int rand = randGenerator.nextInt(10000);
            outKey.set(String.valueOf(rand));
            context.write(outKey, inValue);
        }
    }
    // Mapper02: second stage of the map-side chain; consumes Mapper01's
    // output and filters out records whose numeric key is <= 2000.
    static class Mapper02 extends Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text inKey, Text inValue, Context context)
                throws IOException, InterruptedException {
            int key = Integer.parseInt(inKey.toString());
            if (key > 2000) {
                context.write(inKey, inValue);
            }
        }
    }
    // Reducer01: identity-style reducer that writes every value out
    // under its key unchanged.
    static class Reducer01 extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            for (Text value : values) {
                context.write(key, value);
            }
        }
    }
    // Mapper03: chained after the reducer via ChainReducer.addMapper;
    // filters the reduce output down to records whose key is > 5000.
    static class Mapper03 extends Mapper<Text, Text, Text, Text> {
        @Override
        public void map(Text inKey, Text inValue, Context context)
                throws IOException, InterruptedException {
            int key = Integer.parseInt(inKey.toString());
            if (key > 5000) {
                context.write(inKey, inValue);
            }
        }
    }
    /**
     * @param args input path and output path
     * @throws IOException
     * @throws InterruptedException
     * @throws ClassNotFoundException
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Job.getInstance(Configuration) is the Hadoop 2.x form; the
        // original 0.21-era code went through new Cluster(new Configuration()).
        Job job = Job.getInstance(new Configuration());

        // addMapper is a static method, so no ChainMapper instance is
        // needed. Each stage gets its own (initially empty) Configuration,
        // and the output types of each mapper must match the input types
        // of the next.
        Configuration map01Conf = new Configuration(false);
        ChainMapper.addMapper(job, Mapper01.class, LongWritable.class, Text.class, Text.class, Text.class, map01Conf);
        Configuration map02Conf = new Configuration(false);
        ChainMapper.addMapper(job, Mapper02.class, Text.class, Text.class, Text.class, Text.class, map02Conf);

        job.setJarByClass(ChainMapperJob.class);
        job.setReducerClass(Reducer01.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setPriority(JobPriority.HIGH);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        /*
        // Variant that also uses ChainReducer: Reducer01 is installed with
        // setReducer, and Mapper03 is chained after it to post-process the
        // reduce output (ChainReducer does NOT chain multiple reducers).
        Job job = Job.getInstance(new Configuration());
        Configuration map01Conf = new Configuration(false);
        ChainMapper.addMapper(job, Mapper01.class, LongWritable.class, Text.class, Text.class, Text.class, map01Conf);
        Configuration map02Conf = new Configuration(false);
        ChainMapper.addMapper(job, Mapper02.class, Text.class, Text.class, Text.class, Text.class, map02Conf);
        Configuration reduce01Conf = new Configuration(false);
        ChainReducer.setReducer(job, Reducer01.class, Text.class, Text.class, Text.class, Text.class, reduce01Conf);
        Configuration map03Conf = new Configuration(false);
        ChainReducer.addMapper(job, Mapper03.class, Text.class, Text.class, Text.class, Text.class, map03Conf);
        job.setJarByClass(ChainMapperJob.class);
        job.setNumReduceTasks(1);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        job.setPriority(JobPriority.HIGH);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        */
    }
}
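To try it out, the class would typically be packed into a jar and submitted with the standard hadoop launcher, for example (the jar name and HDFS paths below are placeholders):

hadoop jar chain-example.jar test.com.cn.ChainMapperJob /user/me/input /user/me/output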