A map-side join is the most efficient way to join two datasets, since the join needs no shuffle or reduce phase of its own. On Hadoop, we can use a composite join (CompositeInputFormat) to achieve this between two large datasets, provided both are sorted by the join key and partitioned identically.
The Use Case
First, run each input through an identity mapper and identity reducer. This pass sorts each dataset by key and partitions it by key hash; giving both jobs the same number of reduce tasks guarantees the two inputs end up with the same number of partitions. Here we use -Dmapred.reduce.tasks=2:
$ hadoop jar MyCompositeJoin.jar net.dataeng.examples.IdentityDriver \
-Dmapred.job.queue.name=hdmi-others \
-Dmapred.reduce.tasks=2 \
~/testdata/inleft \
~/testdata/inleftout
The left input is now sorted by key and split into two partitions:
$ hadoop fs -cat ~/testdata/inleftout/part-r-00000
key2 value2
$ hadoop fs -cat ~/testdata/inleftout/part-r-00001
key1 value1
key3 value3
key3 value003
The right input gets the same treatment (again with -Dmapred.reduce.tasks=2), producing the same number of partitions:
$ hadoop fs -cat ~/testdata/inrightout/part-r-00000
key2 value22
$ hadoop fs -cat ~/testdata/inrightout/part-r-00001
key3 value33
key3 value333
key5 value55
Second, run the composite join over the two prepared inputs:
$ hadoop jar MyCompositeJoin.jar net.dataeng.examples.CompositeJoinTestDriver \
-Dmapred.job.queue.name=hdmi-others \
~/testdata/inleftout \
~/testdata/inrightout \
~/testdata/out \
inner
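Given the sample data above, an inner join matches records by key and emits the per-key cross product; key1 and key5 have no match on the other side, so they drop out. With the mapper shown below (which emits the left value as the output key and the right value as the output value), the result should look roughly like this (ordering within a key may differ):
$ hadoop fs -cat ~/testdata/out/part-*
value2 value22
value3 value33
value3 value333
value003 value33
value003 value333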
Note: if the two inputs have different numbers of partitions (i.e., different counts of part-* files), the join fails with an exception: java.io.IOException: Inconsistent split cardinality from child 1 (1/2)
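To catch this before launching the join, you can compare partition counts up front; a quick sanity check, assuming the prepared outputs use the default part-r-NNNNN naming:
$ hadoop fs -ls ~/testdata/inleftout | grep -c 'part-r-'
2
$ hadoop fs -ls ~/testdata/inrightout | grep -c 'part-r-'
2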
The simplest way to use a composite join is to run the preparation jobs with a single reduce task, so that each input consists of exactly one partition, provided the performance of one reducer is acceptable.
The Source Code for the Application
IdentityDriver.java:
package net.dataeng.examples;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class IdentityDriver extends Configured implements Tool {

    @Override
    public Configuration getConf() {
        return super.getConf() == null ? new Configuration() : super.getConf();
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.printf("Usage: %s [generic options] <input> <output>\n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }
        Job job = new Job(getConf(), getClass().getSimpleName());
        job.setJarByClass(IdentityDriver.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // The base Mapper and Reducer classes are identity functions, so this
        // job's only effect is to sort and partition the input by key. The
        // partition count is controlled by -Dmapred.reduce.tasks.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        return job.waitForCompletion(true) ? 0 : 2;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new IdentityDriver(), args));
    }
}

CompositeJoinTestDriver.java:
package net.dataeng.examples;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.join.CompositeInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class CompositeJoinTestDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 4) {
            System.out.printf(
                    "Usage: %s [generic options] <inputA> <inputB> <output> [inner|outer]\n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }
        Path inputAPath = new Path(args[0]);
        Path inputBPath = new Path(args[1]);
        Path outputDir = new Path(args[2]);
        String joinType = args[3];
        if (!(joinType.equalsIgnoreCase("inner")
                || joinType.equalsIgnoreCase("outer"))) {
            System.err.println("Join type must be inner or outer");
            return 2;
        }
        // Build on getConf() so generic options parsed by ToolRunner
        // (e.g. -Dmapred.job.queue.name) are not silently discarded.
        JobConf conf = new JobConf(getConf(), CompositeJoinTestDriver.class);
        conf.setJobName(this.getClass().getName());
        conf.setJarByClass(this.getClass());
        conf.setMapperClass(CompositeJoinMapper.class);
        // Map-only job: the join itself happens in the input format,
        // before the mapper ever sees a record.
        conf.setNumReduceTasks(0);
        conf.setInputFormat(CompositeInputFormat.class);
        // mapred.join.expr describes the join: its type, the input format
        // used to read each source, and the two identically partitioned inputs.
        conf.set("mapred.join.expr", CompositeInputFormat.compose(joinType,
                KeyValueTextInputFormat.class, inputAPath, inputBPath));
        TextOutputFormat.setOutputPath(conf, outputDir);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        // JobClient.runJob submits the job and blocks until it finishes,
        // so no extra polling loop is needed.
        RunningJob job = JobClient.runJob(conf);
        return job.isSuccessful() ? 0 : 2;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CompositeJoinTestDriver(), args);
        System.exit(exitCode);
    }
}
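For reference, CompositeInputFormat.compose builds the join expression string stored in mapred.join.expr. For the invocation above it should produce something along these lines (paths shown abbreviated, not the literal output):

inner(tbl(org.apache.hadoop.mapred.KeyValueTextInputFormat,"<inputA>"),
      tbl(org.apache.hadoop.mapred.KeyValueTextInputFormat,"<inputB>"))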
CompositeJoinMapper.java:
package net.dataeng.examples;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.join.TupleWritable;
public class CompositeJoinMapper extends MapReduceBase implements
        Mapper<Text, TupleWritable, Text, Text> {

    @Override
    public void map(Text key, TupleWritable value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // CompositeInputFormat delivers one TupleWritable per joined key:
        // position 0 holds the record from the first input, position 1 the
        // record from the second. Emit them as a (left value, right value) pair.
        output.collect((Text) value.get(0), (Text) value.get(1));
    }
}
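One caveat worth knowing: with the outer join type, a key may exist in only one input, so a tuple position can be unpopulated and the blind casts above are unsafe. A defensive variant of the map method (a sketch, not part of the original example) can use TupleWritable.has(i) to check which sides are present:

    // Sketch: outer-join-safe map method. TupleWritable.has(i) reports whether
    // position i was populated for this key; a missing side falls back to an
    // empty Text instead of being cast blindly.
    public void map(Text key, TupleWritable value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        Text left = value.has(0) ? (Text) value.get(0) : new Text("");
        Text right = value.has(1) ? (Text) value.get(1) : new Text("");
        output.collect(left, right);
    }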