package com.hadoop.tablerelation;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
* 单表关联
* */
public class STjoin {
public static int time=0;
//map将输入分隔成child和parent,然后正序输出一次作为父亲表,反序输出一次作为孩子表。
//在输出的value中加上父亲表和孩子表的区别标志。
public static class Map extends Mapper<Object,Text,Text,Text>{
public void map(Object key,Text value,Context context) throws IOException, InterruptedException{
String childname = new String();
String parentname = new String();
String relationtype = new String();
String line = value.toString();
int i = 0;
while (line.charAt(i)!=' ') {
i++;
}
String[] values={line.substring(0, i),line.substring(i+1)};
if (values[0].compareTo("child") != 0) {
childname = values[0];
parentname = values[1];
relationtype = "1";//父亲表和孩子表区分标志
//父亲表
context.write(new Text(values[1]),
new Text(relationtype+"+"+childname+"+"+parentname));
//孩子表
relationtype = "2";
context.write(new Text(values[0]),
new Text(relationtype+"+"+childname+"+"+parentname));
}
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text>{
public void reduce(Text key,Iterable<Text> values,Context context) throws IOException, InterruptedException{
//输出表头
if(time==0){
context.write(new Text("grandchild"), new Text("grandparent"));
time++;
}
int grandchildnum = 0;
String grandchild[] = new String[10];
int grandparentnum = 0;
String grandparent[] = new String[10];
Iterator<Text> ite = values.iterator();
System.out.println("开始");//测试用
while (ite.hasNext()) {
String record = ite.next().toString();
System.out.println(key.toString()+" "+record);//测试用
int len = record.length();
int i = 2;
if (len == 0) continue;
char relationtype = record.charAt(0);
String childname = new String();
String parentname = new String();
//获取value-list中value得child
while (record.charAt(i) != '+') {
childname = childname + record.charAt(i);
i++;
}
i = i+1;
//获取value-list中value得parent
while (i<len) {
parentname = parentname + record.charAt(i);
i++;
}
//父亲表取出child放入grandchild
if(relationtype == '1'){
grandchild[grandchildnum] = childname;
grandchildnum++;
} else {//孩子表取出parent放入grandparent
grandparent[grandparentnum] = parentname;
grandparentnum++;
}
}
//grandchild和grandparent数组求笛卡尔积
if (grandparentnum != 0 && grandchildnum != 0) {
System.out.println("----笛卡尔积-----");//测试用
for (int m=0;m<grandchildnum;m++) {
for (int n=0;n<grandparentnum;n++) {
context.write(new Text(grandchild[m]), new Text(grandparent[n]));
}
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf,args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount <in> <out>");
System.exit(2);
}
Job job = new Job(conf,"single table join");
job.setJarByClass(STjoin.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
数据:
child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Mary
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
(1)Map阶段:
map将输入分隔成child和parent,然后正序输出一次作为父亲表,反序输出一次作为孩子表。
父亲表:
Lucy[1+Tom+Lucy,1+Jone+Lucy]
Jack[1+Tom+Jack,1+Jone+Jack]
Mary[1+Lucy+Mary]
Ben[1+Lucy+Ben]
......
孩子表:
Tom[2+Tom+Lucy,2+Tom+Jack]
Jone[2+Jone+Lucy,2+Jone+Jack]
Lucy[2+Lucy+Mary,2+Lucy+Ben]
Jack[2+Jack+Alice,2+Jack+Jesse]
.......
以key为Jack的数据为例:因为框架在shuffle阶段会把相同key的数据合并,所以传给Reduce的数据会是这样:Jack{1+Tom+Jack,1+Jone+Jack,2+Jack+Alice,2+Jack+Jesse}
Tom和Jone的父亲是Jack,Alice和Jesse的孩子是Jack。
(2)Reduce阶段(※数据是一条一条的读取的,一条数据处理完后,再读取另一条数据)
取得父亲表的孩子(前缀为1的数据)Tom和Jone放入孙子数组。取得孩子表的父亲(前缀为2的数据)Alice和Jesse放入爷爷数组。
并当两个数组的数据都不为空时,reduce才输出数据。
程序中测试代码(System.out)的输出数据如下:
开始
Alice 1+Terry+Alice
Alice 1+Jack+Alice
开始
Alma 1+Mark+Alma
Alma 1+Philip+Alma
开始
Ben 1+Lucy+Ben
开始
Jack 2+Jack+Alice
Jack 1+Tom+Jack
Jack 1+Jone+Jack
Jack 2+Jack+Jesse
----笛卡尔积-----
所以可知reduce阶段的数据是一条一条的读取并处理的。
(3)最终输出结果如下
grandchild grandparent
Tom Alice
Tom Jesse
Jone Alice
Jone Jesse
Tom Ben
Tom Mary
Jone Ben
Jone Mary
Philip Alice
Philip Jesse
Mark Alice
Mark Jesse
※PS:Reduce收到的key的组数与Map中context.write()的调用次数没有直接关系:Map输出的数据最终会根据key值合并,相同key的value被归为一组,交给同一次reduce()调用处理。