[ Hadoop | MapReduce ] Using CompositeInputSplit to Improve Join Efficiency

A map-side join is the most efficient way to join datasets in Hadoop, because it avoids shuffling the joined data through a reduce phase. To join two large datasets, we can use a Composite Join (CompositeInputFormat), which requires that both inputs are sorted by key and partitioned into the same number of partitions.


The Use Case

First, run a job with the identity Mapper and identity Reducer over each input, so that both inputs are sorted by key and split into the same number of partitions.

Set the partition count via -Dmapred.reduce.tasks=2:

$ hadoop jar MyCompositeJoin.jar net.dataeng.examples.IdentityDriver \
-Dmapred.job.queue.name=hdmi-others \
-Dmapred.reduce.tasks=2 \
~/testdata/inleft \
~/testdata/inleftout

$ hadoop fs -cat ~/testdata/inleftout/part-r-00000
key2	value2
$ hadoop fs -cat ~/testdata/inleftout/part-r-00001
key1	value1
key3	value3
key3	value003
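
Run the same identity job against the right-hand input (assuming its raw data lives at ~/testdata/inright); since both jobs use the default HashPartitioner with the same number of reducers, equal keys land in the same partition on both sides:

$ hadoop jar MyCompositeJoin.jar net.dataeng.examples.IdentityDriver \
-Dmapred.job.queue.name=hdmi-others \
-Dmapred.reduce.tasks=2 \
~/testdata/inright \
~/testdata/inrightout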

$ hadoop fs -cat ~/testdata/inrightout/part-r-00000
key2	value22
$ hadoop fs -cat ~/testdata/inrightout/part-r-00001
key3	value33
key3	value333
key5	value55

Second, run the composite join over the two pre-partitioned outputs:

$ hadoop jar MyCompositeJoin.jar net.dataeng.examples.CompositeJoinTestDriver \
-Dmapred.job.queue.name=hdmi-others \
~/testdata/inleftout \
~/testdata/inrightout \
~/testdata/out \
inner

Note: if the two inputs have a different number of partitions (i.e. part-* files), an exception is thrown: java.io.IOException: Inconsistent split cardinality from child 1 (1/2)

The simplest way to use a composite join is to set the number of reducers to 1 in the preparation jobs, so that each input consists of a single partition, provided the performance of a single reducer is acceptable.
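
As a quick sanity check before launching the join, you can count the part files on each side (a convenience sketch, not from the original walkthrough, assuming the output directories above):

$ hadoop fs -ls ~/testdata/inleftout | grep -c 'part-'
2
$ hadoop fs -ls ~/testdata/inrightout | grep -c 'part-'
2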

The Source Code for the Application

IdentityDriver.java:

package net.dataeng.examples;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class IdentityDriver extends Configured implements Tool {

	@Override
	public Configuration getConf() {
		return super.getConf() == null ? new Configuration() : super.getConf();
	}

	@Override
	public int run(String[] args) throws Exception {

		if (args.length != 2) {
			System.out.printf(
					"Usage: %s [generic options] <input path> <output path>\n",
					getClass().getSimpleName());
			ToolRunner.printGenericCommandUsage(System.out);
			return -1;
		}

		Job job = new Job(getConf(), this.getClass().getName());

		job.setJarByClass(IdentityDriver.class);

		Path inputPath = new Path(args[0]);
		Path outputPath = new Path(args[1]);

		FileInputFormat.addInputPath(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);
		// The number of reduce tasks (and hence partitions) is supplied on
		// the command line via -Dmapred.reduce.tasks.
		//job.setNumReduceTasks(2);

		// The base Mapper and Reducer classes are identity implementations:
		// records pass through unchanged, and the shuffle between them does
		// the sorting and partitioning.
		job.setMapperClass(Mapper.class);
		job.setReducerClass(Reducer.class);

		job.setInputFormatClass(KeyValueTextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		int exitCode = job.waitForCompletion(true) ? 0 : 1;

		return exitCode;

	}

	public static void main(String[] args) throws Exception {
		System.exit(ToolRunner.run(new IdentityDriver(), args));
	}
}

CompositeJoinTestDriver.java:

package net.dataeng.examples;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.join.CompositeInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class CompositeJoinTestDriver extends Configured implements Tool {

	@Override
	public int run(String[] args) throws Exception {

		if (args.length != 4) {
			System.out.printf(
					"Usage: %s [generic options] <input A> <input B> <output dir> [inner|outer]\n",
					getClass().getSimpleName());
			ToolRunner.printGenericCommandUsage(System.out);
			return -1;
		}

		Path inputAPath = new Path(args[0]);
		Path inputBPath = new Path(args[1]);
		Path outputDir = new Path(args[2]);
		String joinType = args[3];

		if (!(joinType.equalsIgnoreCase("inner") || joinType
				.equalsIgnoreCase("outer"))) {
			System.err.println("Join type not set to inner or outer");
			return 2;
		}

		JobConf conf = new JobConf(new Configuration(),
				CompositeJoinTestDriver.class);
		conf.setJobName(this.getClass().getName());
		conf.setJarByClass(this.getClass());

		conf.setMapperClass(CompositeJoinMapper.class);
		conf.setNumReduceTasks(0);

		// CompositeInputFormat performs the join while reading the splits;
		// the join expression tells it which inputs to combine and how.
		conf.setInputFormat(CompositeInputFormat.class);
		conf.set("mapred.join.expr", CompositeInputFormat.compose(joinType,
				KeyValueTextInputFormat.class, inputAPath, inputBPath));
		TextOutputFormat.setOutputPath(conf, outputDir);

		conf.setMapOutputKeyClass(Text.class);
		conf.setMapOutputValueClass(Text.class);

		conf.setOutputKeyClass(Text.class);
		conf.setOutputValueClass(Text.class);

		// JobClient.runJob() blocks until the job completes, so no extra
		// polling loop is needed.
		RunningJob job = JobClient.runJob(conf);

		return job.isSuccessful() ? 0 : 2;
	}

	public static void main(String[] args) throws Exception {
		int exitCode = ToolRunner.run(new CompositeJoinTestDriver(), args);
		System.exit(exitCode);
	}
}
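
For reference, compose() expands to a join expression string that is stored under mapred.join.expr; for the inner join above it looks roughly like the following (paths shortened here for illustration):

inner(tbl(org.apache.hadoop.mapred.KeyValueTextInputFormat,"~/testdata/inleftout"),
      tbl(org.apache.hadoop.mapred.KeyValueTextInputFormat,"~/testdata/inrightout"))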

CompositeJoinMapper.java:

package net.dataeng.examples;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.join.TupleWritable;

public class CompositeJoinMapper extends MapReduceBase implements
		Mapper<Text, TupleWritable, Text, Text> {

	@Override
	public void map(Text key, TupleWritable value,
			OutputCollector<Text, Text> output, Reporter reporter)
			throws IOException {
		// value.get(0) holds the matched record from the first (left) input
		// and value.get(1) the record from the second (right) input; the
		// join key itself is not emitted.
		output.collect((Text) value.get(0), (Text) value.get(1));
	}
}
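
Given the sample partitions shown earlier, an inner join matches key2 (one pair) and key3 (a 2x2 cross product), while key1 and key5 drop out; this mapper then emits the left value as the output key and the right value as the output value. Worked out by hand (not captured from an actual run, and record order may vary), the result would be:

$ hadoop fs -cat ~/testdata/out/part-00000
value2	value22
$ hadoop fs -cat ~/testdata/out/part-00001
value3	value33
value3	value333
value003	value33
value003	value333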
