When applying a single-table or multi-table join, the relationships that drive the join have to be mined out of the data itself.
Example: given a child-parent table, produce a child-grandparent table. The data is as follows:
Input (child | parent):
tom | jerem
tom | lucy
jerem | xd
lucy | hongton
jack | jc
terry | jc
jc | cy

Expected output (grandchild | grandparent):
tom | xd
tom | hongton
jack | cy
terry | cy
In the map phase, each input line is split into child and parent, and the same record is emitted twice: once with parent as the key and child in the value (the left table), and once with child as the key and parent in the value (the right table). To tell the two apart, a flag is prepended to the value: the character 1 marks a left-table record and 2 marks a right-table record. The map phase thus produces the left and right tables, and the shuffle phase performs the join by grouping records that share a key. The reducer receives the joined result: for each key, the value list contains both kinds of records. Parsing each value, the child names from left-table records go into one array and the parent names from right-table records into another; the Cartesian product of the two arrays gives the final grandchild-grandparent pairs.
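To make the data flow concrete, here is a hand-traced sketch for two of the sample rows (an illustration, not actual job output). For tom | lucy and lucy | hongton the map emits:

key = lucy, value = "1+tom+lucy"      (left table: lucy is tom's parent)
key = lucy, value = "2+lucy+hongton"  (right table: hongton is lucy's parent)

After the shuffle, the reducer for key "lucy" sees both records, puts tom into the grandchild array and hongton into the grandparent array, and the Cartesian product of the two arrays yields the output pair (tom, hongton).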
The code is as follows:
package reverseIndex;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class singleJoin {
public static int time = 0; // used to emit the header row only once
public static class Map extends Mapper<LongWritable, Text,Text,Text>{
public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
String childName = new String();
String parentName = new String();
String relationType = new String(); // flag marking left table vs. right table
//handle line data
StringTokenizer itr = new StringTokenizer(value.toString());
String[] values = new String[2];
int i = 0;
while(itr.hasMoreTokens()){
values[i] = itr.nextToken();
i++;
}
//ignore the title
if(values[0].compareTo("child")!=0){
childName = values[0];
parentName = values[1];
//output the left_table
relationType = "1";
context.write(new Text(values[1]), new Text(relationType+"+"
+childName+"+"+parentName));
//output the right_table
relationType = "2";
context.write(new Text(values[0]),new Text(relationType+"+"
+childName+"+"+parentName));
}
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text>{
public void reduce(Text key,Iterable<Text>value,Context context) throws IOException, InterruptedException{
if(0 == time){
//emit the header row
context.write(new Text("grandchild"),new Text("grandparent"));
time++;
}
int grandchildnum = 0;
int grandparentnum = 0;
//arrays holding the grandchildren and grandparents collected for this key
String[] grandchild = new String[20];
String[] grandparent = new String[20];
Iterator<Text> it = value.iterator();
while(it.hasNext()){
String record = it.next().toString();
int len = record.length();
int i=2;
if(0==len){
continue;
}
char relationType = record.charAt(0);
String childName = new String();
String parentName = new String();
while(record.charAt(i)!='+'){
childName += record.charAt(i);
i++;
}
i += 1;
while(i<len){
parentName += record.charAt(i);
i++;
}
if('1'==relationType){
grandchild[grandchildnum] = childName;
grandchildnum++;
}
if('2'==relationType){
grandparent[grandparentnum] = parentName;
grandparentnum++;
}
}//end while over the value list for this key
if(0!=grandchildnum && 0!=grandparentnum){
for(int m=0;m<grandchildnum;m++){
for(int n=0;n<grandparentnum;n++){
context.write(new Text(grandchild[m]),new Text(grandparent[n]));
}
}
}
}//end reduce
}//end Reduce
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = new Job(conf,"singleJoin");
job.setJarByClass(singleJoin.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}
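Assuming the class has been built into a jar (the jar name and HDFS paths below are placeholders, not taken from the original), the job could be submitted roughly like this, with the input directory as the first argument and a not-yet-existing output directory as the second:

hadoop jar join-examples.jar reverseIndex.singleJoin /user/hadoop/single_join_in /user/hadoop/single_join_out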
Multi-table join: the input is two files, a student table containing student names and course IDs, and a course table containing course IDs and teacher names; the output lists each student with the corresponding teacher.
Course-teacher table (courseId | teacher):
1 | cy
2 | ld
3 | cy
4 | ld

Student-course table (student | courseId):
xd | 1
xyy | 2
lf | 3
zz | 3

Expected output (student | teacher):
xd | cy
xyy | ld
lf | cy
zz | cy
The approach is the same as above: tag each record as belonging to the left (student) or right (course-teacher) table, join on the course ID in the shuffle, and take the Cartesian product in the reducer.
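As a quick sanity check against the sample data (again just a hand-traced illustration, not actual job output): for key "3" the map emits "2+cy " from the course row 3 | cy, and "1+lf " and "1+zz " from the student rows lf | 3 and zz | 3; the reducer then takes the Cartesian product of {lf, zz} and {cy}, producing (lf, cy) and (zz, cy).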
The code is as follows:
package reverseIndex;
import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class multiJoin {
public static int time = 0;
public static class Map extends Mapper<LongWritable,Text,Text,Text>{
public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
String mapKey = new String();
String mapValue = new String();
String line = value.toString();
String relationType = new String();
StringTokenizer it = new StringTokenizer(line);
if(line.contains("studentName")||line.contains("courseId")){
return;
}
int i=0;
while(it.hasMoreTokens()){
String token = it.nextToken();
//if the course id comes first, the record belongs to the right (course-teacher) table; if the name comes first, it belongs to the left (student) table
if(token.charAt(0)>='0'&&token.charAt(0)<='9'){
mapKey = token;
if(i>0){
relationType = "1";
}else{
relationType = "2";
}
continue;
}
mapValue += token+" ";
i++;
}
//emit the tagged record for the left or right table
context.write(new Text(mapKey),new Text(relationType+"+"+mapValue));
}
}
public static class Reduce extends Reducer<Text,Text,Text,Text>{
public void reduce(Text key,Iterable<Text>values,Context context) throws IOException, InterruptedException{
if(time == 0){
context.write(new Text("student"),new Text("teacher"));
time++;
}
int studentnum = 0;
int teachernum = 0;
String[] student = new String[10];
String[] teacher = new String[10];
Iterator<Text> it = values.iterator();
while(it.hasNext()){
String record = it.next().toString();
int len = record.length();
int i=2;//skip the relation-type flag and the '+' separator
if(0 == len){
continue; //skip empty records; returning here would drop the remaining values
}
char relationType = record.charAt(0);
if('1' == relationType){
student[studentnum] = record.substring(i);
studentnum++;
}
if('2'==relationType){
teacher[teachernum] = record.substring(i);
teachernum++;
}
}
if(0!=studentnum && 0!=teachernum){
for(int m=0;m<studentnum;m++){
for(int n=0;n<teachernum;n++){
context.write(new Text(student[m]), new Text(teacher[n]));
}
}
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = new Job(conf,"multiJoin");
job.setJarByClass(multiJoin.class);
job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true)?0:1);
}
}
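Both input files have to be reachable through the single path passed as args[0], for example by putting the student file and the course-teacher file into the same HDFS directory. A hypothetical invocation (jar name and paths are placeholders):

hadoop jar join-examples.jar reverseIndex.multiJoin /user/hadoop/multi_join_in /user/hadoop/multi_join_out

Note that the new Job(conf, name) constructor used in both examples is deprecated in Hadoop 2.x; Job.getInstance(conf, name) is the usual replacement.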