多表关联和单表关联类似,它也是通过对原始数据进行一定的处理,从其中挖掘出关心的信息
1 实例描述
输入是两个文件,一个代表工厂表,包含工厂名列和地址编号列;另一个代表地址表,包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系,输出"工厂名——地址名"表
地址表
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
工厂表
factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Back of Beijing 1
期望输出:
factoryname addressname
Back of Beijing Beijing
Beijing Red Star Beijing
Beijing Rising Beijing
Guangzhou Development Bank Guangzhou
Guangzhou Honda Guangzhou
Shenzhen Thunder Shenzhen
Tencent Shenzhen
2 问题分析
多表关联和单表关联相似,都类似于数据库中的自然连接。相比单表关联,多表关联的左右表和连接列更加清楚。所以可以采用和单表关联的相同的处理方式,map识别出输入的行属于哪个表之后,对其进行分割,将连接的列值保存在key中,另一列和左右表标识保存在value中,然后输出。reduce拿到连接结果之后,解析value内容,根据标志将左右表内容分开存放,然后求笛卡尔积,最后直接输出。
3 代码
package mr;
import java.net.URI;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MyFactory {
static class MyMapper extends Mapper<LongWritable, Text, Text, Text>{
public void map(LongWritable k1, Text v1, Context context)
throws java.io.IOException, java.lang.InterruptedException
{
int i = 0;
String[] lines= v1.toString().split(",");
if((lines.length != 2 && lines[0].equals("factoryname") )||
(lines.length != 2 && lines[0].equals("addressID")))return;
if(lines[0].charAt(0) >= '0' && lines[0].charAt(0) <= '9')i++;
if(i < 1){
String f_name = lines[0];
String f_address = lines[1];
context.write(new Text(f_address), new Text("0"+","+f_name+","+f_address));
System.out.println(f_address+f_name);
}
else{
String a_id = lines[0];
String a_name = lines[1];
context.write(new Text(a_id), new Text("1"+","+a_name+","+a_id));
}
System.out.println("map......");
}
}
static class MyReduce extends Reducer<Text, Text, Text, Text>{
public void reduce(Text key, Iterable<Text> values, Context context) throws java.io.IOException, java.lang.InterruptedException
{
List<String> f_name = new ArrayList();
List<String> a_name = new ArrayList();
Iterator<Text> it = values.iterator();
while(it.hasNext()){
String line = it.next().toString();
System.out.println(key+" "+line);
String[] words = line.split(",");
if(words[0].equals("0")){
f_name.add(words[1]);
}
else if(words[0].equals("1")){
a_name.add(words[1]);
}
else return;
}
System.out.println("reduce......");
for (String a : a_name) {
for (String f : f_name) {
context.write(new Text(f), new Text(a));
}
}
}
}
private static String INPUT_PATH="hdfs://master:9000/input/factory.dat";
private static String INPUT_PATH1="hdfs://master:9000/input/address.dat";
private static String OUTPUT_PATH="hdfs://master:9000/output/c/";
public static void main(String[] args) throws Exception {
Configuration conf=new Configuration();
FileSystem fs=FileSystem.get(new URI(OUTPUT_PATH),conf);
if(fs.exists(new Path(OUTPUT_PATH)))
fs.delete(new Path(OUTPUT_PATH));
Job job=new Job(conf,"myjob");
job.setJarByClass(MyGL.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
FileInputFormat.addInputPath(job,new Path(INPUT_PATH));
FileInputFormat.addInputPath(job,new Path(INPUT_PATH1));
FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
job.waitForCompletion(true);
}
}