Data:
Two input files, 1.txt (4 lines) and 2.txt (5 lines):
hadoop@ubuntu:/usr/local/hadoop$ hdfs dfs -cat input/*
a1 b1
a1 b2
a3 b3
a4 b4
a1 c1
a1 c2
a3 c3
a3 c4
a5 c5
Result (an inner join: only keys present in both files, here a1 and a3, produce output):
a1 b2 c2
a1 b2 c1
a1 b1 c2
a1 b1 c1
a3 b3 c4
a3 b3 c3
Code:
The idea is simple: on the map side, tag each value with the file it came from; on the reduce side, perform the join.
(An alternative is to tag the map-output key as keyword:docID, which requires a custom partition function that partitions by the keyword alone rather than by the full keyword:docID. I haven't implemented that yet, but a sketch follows below.)
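As a hedged sketch of that alternative (not implemented in this post), such a partitioner might look like the following, assuming composite map-output keys of the form keyword:docID; it would be registered with job.setPartitionerClass(KeywordPartitioner.class):

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route composite keys "keyword:docID" by the
// keyword part only, so all records sharing a keyword reach the same reducer.
public class KeywordPartitioner extends Partitioner<Text, Text>
{
    @Override
    public int getPartition(Text key, Text value, int numPartitions)
    {
        String keyword = key.toString().split(":")[0];
        // Mask the sign bit so the modulo result is never negative.
        return (keyword.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

Here is the reduce-side join itself: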
package hadoop.examples;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class EquivalentJoin {
    public static class TokenizerMapper extends Mapper<Object, Text, Text, Text>
    {
        // Define reusable fields outside map() to reduce the number of
        // objects created on each call.
        private Text keyInfo = new Text();
        private Text valueInfo = new Text();
        private FileSplit split;
        private String fileName = null;
        private String[] line = null;
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException
        {
            // Hadoop's default input format is TextInputFormat:
            // the file is broken into lines terminated by a newline;
            // the key is each line's byte offset (LongWritable),
            // and the value is the line's content (Text).
            split = (FileSplit) context.getInputSplit(); // the FileSplit this <key, value> pair came from
            fileName = split.getPath().getName();        // last path component, e.g. "1.txt"
            line = value.toString().split(" ");
            if (line.length != 2)
            {
                return; // skip malformed lines
            }
            // Emit the join key, tagging the value with its source file.
            keyInfo.set(line[0]);
            valueInfo.set(fileName + ":" + line[1]);
            context.write(keyInfo, valueInfo);
        }
    }
    public static class JoinReducer extends Reducer<Text, Text, Text, Text>
    {
        private final static String fileName1 = "1.txt";
        private final static String fileName2 = "2.txt";
        private String[] valueInfo = null;
        private String fileName = null;

        @Override
        public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            // These lists must be re-created for every key; moving them
            // outside reduce() would accumulate values across keys.
            ArrayList<String> table1 = new ArrayList<String>();
            ArrayList<String> table2 = new ArrayList<String>();
            for (Text val : values)
            {
                valueInfo = val.toString().split(":");
                fileName = valueInfo[0];
                if (fileName.equals(fileName1))
                {
                    table1.add(valueInfo[1]);
                }
                else if (fileName.equals(fileName2))
                {
                    table2.add(valueInfo[1]);
                }
            }
            // The Cartesian product of the two sides completes the join.
            for (String tab1 : table1)
            {
                for (String tab2 : table2)
                {
                    context.write(key, new Text(tab1 + " " + tab2));
                }
            }
        }
    }
    public static void main(String[] args) throws Exception
    {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2)
        {
            System.err.println("Usage: EquivalentJoin <in> <out>");
            System.exit(2);
        }
        Path inputPath = new Path(otherArgs[0]);
        Path outputPath = new Path(otherArgs[1]);
        // Delete any existing output directory so the job can be re-run.
        outputPath.getFileSystem(conf).delete(outputPath, true);
        // The constructor Job(Configuration conf, String jobName) is deprecated:
        // Job job = new Job(conf, "EquivalentJoin");
        Job job = Job.getInstance(conf, "EquivalentJoin");
        job.setJarByClass(EquivalentJoin.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // job.setCombinerClass(MyCombiner.class); // no combiner: a join cannot be partially reduced
        job.setReducerClass(JoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
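To run the job and inspect the output (the jar name below is an illustrative assumption, not from the original post):

hadoop@ubuntu:/usr/local/hadoop$ hadoop jar EquivalentJoin.jar hadoop.examples.EquivalentJoin input output
hadoop@ubuntu:/usr/local/hadoop$ hdfs dfs -cat output/*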