Data:
Two input files, 1.txt (4 lines) and 2.txt (5 lines):
hadoop@ubuntu:/usr/local/hadoop$ hdfs dfs -cat input/*
a1 b1
a1 b2
a3 b3
a4 b4
a1 c1
a1 c2
a3 c3
a3 c4
a5 c5
Result (an inner join: only keys present in both files, here a1 and a3, produce output):
a1 b2 c2
a1 b2 c1
a1 b1 c2
a1 b1 c1
a3 b3 c4
a3 b3 c3
Code:
The idea is simple: on the map side, tag each value with the file it came from; on the reduce side, perform the join.
(An alternative is to tag the map-output key as keyword:docID, which requires a custom partition function that partitions by the keyword alone rather than by the full keyword:docID. I haven't implemented that yet, but a sketch follows below.)
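As a hedged sketch of that alternative (not implemented in this post), such a partitioner might look like the following, assuming composite map-output keys of the form keyword:docID; it would be registered with job.setPartitionerClass(KeywordPartitioner.class):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route composite keys "keyword:docID" by the
// keyword part only, so all records sharing a keyword reach the same reducer.
public class KeywordPartitioner extends Partitioner<Text, Text>
{
    @Override
    public int getPartition(Text key, Text value, int numPartitions)
    {
        String keyword = key.toString().split(":")[0];
        // Mask the sign bit so the modulo result is never negative.
        return (keyword.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

Here is the reduce-side join itself: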
package hadoop.examples;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class EquivalentJoin {
    public static class TokenizerMapper extends Mapper<Object, Text, Text, Text>
    {
        // Define reusable fields outside map() to reduce the number of
        // objects created on each call.
        private Text keyInfo = new Text();
        private Text valueInfo = new Text();
        private FileSplit split;
        private String fileName = null;
        private String[] line = null;
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            // Hadoop's default input format is TextInputFormat:
            // the file is broken into lines terminated by a newline;
            // the key is each line's byte offset (LongWritable),
            // and the value is the line's content (Text).
            split = (FileSplit) context.getInputSplit(); // the FileSplit this <key, value> pair came from
            fileName = split.getPath().getName();        // last path component, e.g. "1.txt"
            line = value.toString().split(" ");
            if (line.length != 2)
            {
                return; // skip malformed lines
            }
            // Emit the join key, tagging the value with its source file.
            keyInfo.set(line[0]);
            valueInfo.set(fileName + ":" + line[1]);
            context.write(keyInfo, valueInfo);
        }
    }
    public static class JoinReducer extends Reducer<Text, Text, Text, Text>
    {
        private final static String fileName1 = "1.txt";
        private final static String fileName2 = "2.txt";
        private String[] valueInfo = null;
        private String fileName = null;

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            // These lists must be re-created for every key; moving them
            // outside reduce() would accumulate values across keys.
            ArrayList<String> table1 = new ArrayList<String>();
            ArrayList<String> table2 = new ArrayList<String>();
            for (Text val : values)
            {
                valueInfo = val.toString().split(":");
                fileName = valueInfo[0];
                if (fileName.equals(fileName1))
                {
                    table1.add(valueInfo[1]);
                }
                else if (fileName.equals(fileName2))
                {
                    table2.add(valueInfo[1]);
                }
            }
            // The Cartesian product of the two sides completes the join.
            for (String tab1 : table1)
            {
                for (String tab2 : table2)
                {
                    context.write(key, new Text(tab1 + " " + tab2));
                }
            }
        }
    }
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2)
        {
            System.err.println("Usage: EquivalentJoin <in> <out>");
            System.exit(2);
        }
        Path inputPath = new Path(otherArgs[0]);
        Path outputPath = new Path(otherArgs[1]);
        // Delete any existing output directory so the job can be re-run.
        outputPath.getFileSystem(conf).delete(outputPath, true);
        // The constructor Job(Configuration conf, String jobName) is deprecated:
        // Job job = new Job(conf, "EquivalentJoin");
        Job job = Job.getInstance(conf, "EquivalentJoin");
        job.setJarByClass(EquivalentJoin.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // job.setCombinerClass(MyCombiner.class); // no combiner: a join cannot be partially reduced
        job.setReducerClass(JoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
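To run the job and inspect the output (the jar name below is an illustrative assumption, not from the original post):

hadoop@ubuntu:/usr/local/hadoop$ hadoop jar EquivalentJoin.jar hadoop.examples.EquivalentJoin input output
hadoop@ubuntu:/usr/local/hadoop$ hdfs dfs -cat output/*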