由于书中给出的代码可读性比较差,我自己稍微修改了一下。
此处输入的是 child-parent 表,要求输出 grandchild-grandparent 表。
input file:
child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
package me.river.study.hadoop.mr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Single-table self join: reads a child-parent table and writes the derived
 * grandchild-grandparent table.
 *
 * <p>The mapper emits every relation twice, once keyed by the parent and once
 * keyed by the child, each tagged with which side of the join it belongs to.
 * The reducer then sees, for one person, all of that person's children and all
 * of that person's parents, and emits their cartesian product.
 */
public class STjoin {

    /** Tag for a value holding a child of the key (left side of the join). */
    private static final String CHILD_TAG = "1";

    /** Tag for a value holding a parent of the key (right side of the join). */
    private static final String PARENT_TAG = "2";

    /**
     * Turns each "child parent" input line into two tagged records so that the
     * same person can be joined as both a parent and a child.
     */
    public static class STMapper extends Mapper<LongWritable, Text, Text, Text> {

        private Text outKey = new Text();
        private Text outValue = new Text();

        /**
         * Emits {@code (parent, "1\tchild")} and {@code (child, "2\tparent")}
         * for one input line.
         *
         * <p>Lines with fewer than two whitespace-separated fields (for example
         * blank lines) and lines whose first field is {@code child} (the table
         * header) are skipped.
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one line of the child-parent table
         * @param context sink for the tagged records
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // trim + "\\s+" so leading/trailing whitespace and runs of spaces or
            // tabs do not produce empty fields that would be joined as names.
            String[] fields = value.toString().trim().split("\\s+");
            if (fields.length < 2 || "child".equals(fields[0])) {
                return;
            }

            String childname = fields[0];
            String parentname = fields[1];

            // Left table: keyed by the parent, carries the parent's child.
            outKey.set(parentname);
            outValue.set(CHILD_TAG + "\t" + childname);
            context.write(outKey, outValue);

            // Right table: keyed by the child, carries the child's parent.
            outKey.set(childname);
            outValue.set(PARENT_TAG + "\t" + parentname);
            context.write(outKey, outValue);
        }
    }

    /**
     * Joins the two tagged sides for one person and emits every
     * (grandchild, grandparent) pair that passes through that person.
     */
    public static class STReducer extends Reducer<Text, Text, Text, Text> {

        private Text outKey = new Text();
        private Text outValue = new Text();

        // Kept per reducer instance rather than as shared static state, so one
        // instance writing the header cannot suppress it for another instance
        // living in the same JVM.
        private boolean headerWritten = false;

        /**
         * Writes the header row on the first call, then the cartesian product
         * of the key's children and the key's parents.
         *
         * @param key     the person acting as the middle generation
         * @param values  tagged records produced by {@link STMapper}
         * @param context sink for the grandchild-grandparent rows
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // Table header.
            if (!headerWritten) {
                outKey.set("grandchild");
                outValue.set("grandparent");
                context.write(outKey, outValue);
                headerWritten = true;
            }

            // The key's children are grandchildren of the key's parents.
            List<String> grandchildren = new ArrayList<String>();
            List<String> grandparents = new ArrayList<String>();
            for (Text value : values) {
                String[] tagged = value.toString().split("\t");
                if (CHILD_TAG.equals(tagged[0])) {
                    grandchildren.add(tagged[1]);
                } else {
                    grandparents.add(tagged[1]);
                }
            }

            // Cartesian product; the loops emit nothing when either side is empty.
            for (String grandchild : grandchildren) {
                for (String grandparent : grandparents) {
                    outKey.set(grandchild);
                    outValue.set(grandparent);
                    context.write(outKey, outValue);
                }
            }
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @throws Exception if the job cannot be set up or run
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: STjoin <input path> <output path>");
            System.exit(2);
        }

        Job job = Job.getInstance(new Configuration(), "single table join");
        job.setJarByClass(STjoin.class);
        job.setMapperClass(STMapper.class);
        job.setReducerClass(STReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
输出结果:
grandchild grandparent
Tom Alice
Tom Jesse
Jone Alice
Jone Jesse
Tom Ben
Tom Mary
Jone Ben
Jone Mary
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse