A map-side join is the most efficient way to join two datasets, since the join needs no shuffle or reduce phase of its own. On Hadoop, we can use a composite join (CompositeInputFormat) to achieve this between two large datasets, provided both are sorted by the join key and partitioned identically.
The Use Case
First, run each input through an identity mapper and identity reducer. This pass sorts each dataset by key and partitions it by key hash; giving both jobs the same number of reduce tasks guarantees the two inputs end up with the same number of partitions. Here we use -Dmapred.reduce.tasks=2:
$ hadoop jar MyCompositeJoin.jar net.dataeng.examples.IdentityDriver \
-Dmapred.job.queue.name=hdmi-others \
-Dmapred.reduce.tasks=2 \
~/testdata/inleft \
~/testdata/inleftout
The left input is now sorted by key and split into two partitions:
$ hadoop fs -cat ~/testdata/inleftout/part-r-00000
key2 value2
$ hadoop fs -cat ~/testdata/inleftout/part-r-00001
key1 value1
key3 value3
key3 value003
The right input gets the same treatment (again with -Dmapred.reduce.tasks=2), producing the same number of partitions:
$ hadoop fs -cat ~/testdata/inrightout/part-r-00000
key2 value22
$ hadoop fs -cat ~/testdata/inrightout/part-r-00001
key3 value33
key3 value333
key5 value55
Second, run the composite join over the two prepared inputs:
$ hadoop jar MyCompositeJoin.jar net.dataeng.examples.CompositeJoinTestDriver \
-Dmapred.job.queue.name=hdmi-others \
~/testdata/inleftout \
~/testdata/inrightout \
~/testdata/out \
inner
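Given the sample data above, an inner join matches records by key and emits the per-key cross product; key1 and key5 have no match on the other side, so they drop out. With the mapper shown below (which emits the left value as the output key and the right value as the output value), the result should look roughly like this (ordering within a key may differ):
$ hadoop fs -cat ~/testdata/out/part-*
value2 value22
value3 value33
value3 value333
value003 value33
value003 value333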
Note: if the two inputs have different numbers of partitions (i.e., different counts of part-* files), the join fails with an exception: java.io.IOException: Inconsistent split cardinality from child 1 (1/2)
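To catch this before launching the join, you can compare partition counts up front; a quick sanity check, assuming the prepared outputs use the default part-r-NNNNN naming:
$ hadoop fs -ls ~/testdata/inleftout | grep -c 'part-r-'
2
$ hadoop fs -ls ~/testdata/inrightout | grep -c 'part-r-'
2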
The simplest way to use a composite join is to run the preparation jobs with a single reduce task, so that each input consists of exactly one partition, provided the performance of one reducer is acceptable.
The Source Code for the Application
IdentityDriver.java:
package net.dataeng.examples;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class IdentityDriver extends Configured implements Tool {

    @Override
    public Configuration getConf() {
        return super.getConf() == null ? new Configuration() : super.getConf();
    }

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.out.printf("Usage: %s [generic options] <input> <output>\n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }
        Job job = new Job(getConf(), getClass().getSimpleName());
        job.setJarByClass(IdentityDriver.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // The base Mapper and Reducer classes are identity functions, so this
        // job's only effect is to sort and partition the input by key. The
        // partition count is controlled by -Dmapred.reduce.tasks.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        return job.waitForCompletion(true) ? 0 : 2;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new IdentityDriver(), args));
    }
}

CompositeJoinTestDriver.java:
package net.dataeng.examples;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.join.CompositeInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class CompositeJoinTestDriver extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 4) {
            System.out.printf(
                    "Usage: %s [generic options] <inputA> <inputB> <output> [inner|outer]\n",
                    getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.out);
            return -1;
        }
        Path inputAPath = new Path(args[0]);
        Path inputBPath = new Path(args[1]);
        Path outputDir = new Path(args[2]);
        String joinType = args[3];
        if (!(joinType.equalsIgnoreCase("inner")
                || joinType.equalsIgnoreCase("outer"))) {
            System.err.println("Join type must be inner or outer");
            return 2;
        }
        // Build on getConf() so generic options parsed by ToolRunner
        // (e.g. -Dmapred.job.queue.name) are not silently discarded.
        JobConf conf = new JobConf(getConf(), CompositeJoinTestDriver.class);
        conf.setJobName(this.getClass().getName());
        conf.setJarByClass(this.getClass());
        conf.setMapperClass(CompositeJoinMapper.class);
        // Map-only job: the join itself happens in the input format,
        // before the mapper ever sees a record.
        conf.setNumReduceTasks(0);
        conf.setInputFormat(CompositeInputFormat.class);
        // mapred.join.expr describes the join: its type, the input format
        // used to read each source, and the two identically partitioned inputs.
        conf.set("mapred.join.expr", CompositeInputFormat.compose(joinType,
                KeyValueTextInputFormat.class, inputAPath, inputBPath));
        TextOutputFormat.setOutputPath(conf, outputDir);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(Text.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        // JobClient.runJob submits the job and blocks until it finishes,
        // so no extra polling loop is needed.
        RunningJob job = JobClient.runJob(conf);
        return job.isSuccessful() ? 0 : 2;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new CompositeJoinTestDriver(), args);
        System.exit(exitCode);
    }
}
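For reference, CompositeInputFormat.compose builds the join expression string stored in mapred.join.expr. For the invocation above it should produce something along these lines (paths shown abbreviated, not the literal output):

inner(tbl(org.apache.hadoop.mapred.KeyValueTextInputFormat,"<inputA>"),
      tbl(org.apache.hadoop.mapred.KeyValueTextInputFormat,"<inputB>"))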
CompositeJoinMapper.java:
package net.dataeng.examples;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.join.TupleWritable;
public class CompositeJoinMapper extends MapReduceBase implements
        Mapper<Text, TupleWritable, Text, Text> {

    @Override
    public void map(Text key, TupleWritable value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        // CompositeInputFormat delivers one TupleWritable per joined key:
        // position 0 holds the record from the first input, position 1 the
        // record from the second. Emit them as a (left value, right value) pair.
        output.collect((Text) value.get(0), (Text) value.get(1));
    }
}
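One caveat worth knowing: with the outer join type, a key may exist in only one input, so a tuple position can be unpopulated and the blind casts above are unsafe. A defensive variant of the map method (a sketch, not part of the original example) can use TupleWritable.has(i) to check which sides are present:

    // Sketch: outer-join-safe map method. TupleWritable.has(i) reports whether
    // position i was populated for this key; a missing side falls back to an
    // empty Text instead of being cast blindly.
    public void map(Text key, TupleWritable value,
            OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        Text left = value.has(0) ? (Text) value.get(0) : new Text("");
        Text right = value.has(1) ? (Text) value.get(1) : new Text("");
        output.collect(left, right);
    }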