Preface
When building data products we often need to join inputs from multiple directories, i.e., multiple dimensions. This post analyzes how the data actually flows in such a job.
1. Multiple input directories
Use MultipleInputs.addInputPath() to specify an input format and a mapper for each input directory.
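As a minimal sketch (paths and mapper names here are placeholders; note that this post uses an in-house com.sina.hadoop.MultipleInputs, while stock Hadoop provides the equivalent org.apache.hadoop.mapreduce.lib.input.MultipleInputs):

MultipleInputs.addInputPath(job, new Path("input/a/"), TextInputFormat.class, AMapper.class);
MultipleInputs.addInputPath(job, new Path("input/b/"), TextInputFormat.class, BMapper.class);

Each directory is bound to its own InputFormat and Mapper, which lets one job consume differently formatted sources.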
2. Data flow analysis
The mappers read their input line by line and tag each record with a marker identifying which input directory it came from (this technique is known as a reduce-side join). The map output is then partitioned and sorted by key, and the sorted groups are handed to the reducer for processing. The worked trace after the sample inputs below makes this concrete.
Example
Three input files:
a.txt:
500
501
b.txt:
500 501
600 505
c.txt:
501 500
700 800
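Before the code, here is the expected flow for these inputs, a sketch assuming the columns are tab-separated (the mappers split on "\t") and using the tags the mappers assign below (a.txt → "1", b.txt → "2", c.txt → "3"):

Map output as (TextPair, value) pairs:
(500, 1) -> 0      from a.txt
(501, 1) -> 0      from a.txt
(500, 2) -> 501    from b.txt
(600, 2) -> 505    from b.txt
(501, 3) -> 500    from c.txt
(700, 3) -> 800    from c.txt

Sorting uses the full (key, tag) pair while partitioning and grouping use only the first field, so each reduce() call sees every tagged record for one join key, ordered by tag:
500 -> [0, 501]
501 -> [0, 500]
600 -> [505]
700 -> [800]

The reducer emits each value with the first key, giving the final output:
500 0
500 501
501 0
501 500
600 505
700 800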
Code
import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Custom classes: TextPair (a WritableComparable of two Text fields, sketched
// at the end of this post) and an in-house MultipleInputs; stock Hadoop ships
// an equivalent org.apache.hadoop.mapreduce.lib.input.MultipleInputs.
import util.TextPair;
import com.sina.hadoop.MultipleInputs;

public class Main extends Configured implements Tool
{
    public static void main(String[] args) throws Exception
    {
        int exitcode = ToolRunner.run(new Main(), args);
        System.exit(exitcode);
    }

    /**
     * Partition on the first (join) field only, so that all records sharing
     * a key land on the same reducer regardless of their source tag.
     */
    static class TextPairKeyPartitioner extends Partitioner<TextPair, Text>
    {
        @Override
        public int getPartition(TextPair key, Text value, int numPartitions)
        {
            return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    @Override
    public int run(String[] args) throws Exception
    {
        Job job1 = new Job(getConf());
        job1.setJobName("testMultipleInputs");
        job1.setJarByClass(Main.class);

        // Bind each input directory to its own input format and mapper.
        MultipleInputs.addInputPath(job1, new Path("xx/testMultipleInputs/input/a/"),
                TextInputFormat.class, AMapper.class);
        MultipleInputs.addInputPath(job1, new Path("xx/testMultipleInputs/input/b/"),
                TextInputFormat.class, BMapper.class);
        MultipleInputs.addInputPath(job1, new Path("xx/testMultipleInputs/input/c/"),
                TextInputFormat.class, CMapper.class);

        job1.setReducerClass(TestReducer.class);
        FileOutputFormat.setOutputPath(job1, new Path("xx/testMultipleInputs/output/"));
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(Text.class);

        // Sort on the full (key, tag) pair, but partition and group on the
        // first field only: one reduce() call per join key, values in tag order.
        job1.setPartitionerClass(TextPairKeyPartitioner.class);
        job1.setGroupingComparatorClass(TextPair.FirstComparator.class);
        job1.setMapOutputKeyClass(TextPair.class);
        job1.setMapOutputValueClass(Text.class);
        job1.setNumReduceTasks(1);

        return job1.waitForCompletion(true) ? 0 : 1;
    }

    // Mappers and the reducer must be static nested classes: Hadoop
    // instantiates them by reflection, which requires a no-arg constructor
    // that non-static inner classes do not have.
    public static class AMapper extends Mapper<LongWritable, Text, TextPair, Text>
    {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            // a.txt has a single column; tag it "1" with a dummy value "0".
            String[] data = value.toString().split("\t", -1);
            if (data.length >= 1 && !"".equals(data[0]))
            {
                context.write(new TextPair(data[0], "1"), new Text("0"));
            }
        }
    }

    public static class BMapper extends Mapper<LongWritable, Text, TextPair, Text>
    {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            // b.txt has two columns; tag the join key "2" and keep the
            // second column as the value.
            String[] data = value.toString().split("\t", -1);
            if (data.length >= 2 && !"".equals(data[0]))
            {
                context.write(new TextPair(data[0], "2"), new Text(data[1]));
            }
        }
    }

    public static class CMapper extends Mapper<LongWritable, Text, TextPair, Text>
    {
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            // Same as BMapper, but records from c.txt are tagged "3".
            String[] data = value.toString().split("\t", -1);
            if (data.length >= 2 && !"".equals(data[0]))
            {
                context.write(new TextPair(data[0], "3"), new Text(data[1]));
            }
        }
    }

    public static class TestReducer extends Reducer<TextPair, Text, Text, Text>
    {
        @Override
        public void reduce(TextPair key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            // Thanks to the grouping comparator, one call receives every
            // tagged record for a join key, ordered "1" < "2" < "3".
            Iterator<Text> i = values.iterator();
            while (i.hasNext())
            {
                context.write(key.getFirst(), new Text(i.next().toString()));
            }
        }
    }
}
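The code imports util.TextPair, which the post does not show. For completeness, here is a minimal sketch of what that class has to provide for the job above, modeled on the TextPair from "Hadoop: The Definitive Guide"; the actual util.TextPair may differ in detail:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class TextPair implements WritableComparable<TextPair>
{
    private Text first;
    private Text second;

    public TextPair()
    {
        this.first = new Text();
        this.second = new Text();
    }

    public TextPair(String first, String second)
    {
        this.first = new Text(first);
        this.second = new Text(second);
    }

    public Text getFirst() { return first; }
    public Text getSecond() { return second; }

    public void write(DataOutput out) throws IOException
    {
        first.write(out);
        second.write(out);
    }

    public void readFields(DataInput in) throws IOException
    {
        first.readFields(in);
        second.readFields(in);
    }

    // Full sort order: first field, then the source tag. This is what puts
    // values in tag order within one reduce group.
    public int compareTo(TextPair tp)
    {
        int cmp = first.compareTo(tp.first);
        return cmp != 0 ? cmp : second.compareTo(tp.second);
    }

    @Override
    public int hashCode()
    {
        return first.hashCode() * 163 + second.hashCode();
    }

    @Override
    public boolean equals(Object o)
    {
        if (o instanceof TextPair)
        {
            TextPair tp = (TextPair) o;
            return first.equals(tp.first) && second.equals(tp.second);
        }
        return false;
    }

    // Grouping comparator: compare the first field only, so that all tags
    // for one key fall into a single reduce() call.
    public static class FirstComparator extends WritableComparator
    {
        public FirstComparator()
        {
            super(TextPair.class, true);
        }

        @Override
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b)
        {
            return ((TextPair) a).getFirst().compareTo(((TextPair) b).getFirst());
        }
    }
}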