多表关联
实例描述:
输入是两个文件,一个代表工厂表,包含工厂名列和地址编号列;另一个代表地址表,包含地址名列和地址编号列。要求从输入数据中找出工厂名和地址名的对应关系,输出"工厂名——地址名"表。
样例输入:
factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Back of Beijing 1
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Back of Beijing 1
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
期望输出:
factoryname addressname
Back of Beijing Beijing
Beijing Red Star Beijing
Beijing Rising Beijing
Guangzhou Development Bank Guangzhou
Guangzhou Honda Guangzhou
Shenzhen Thunder Shenzhen
Tencent Shenzhen
Back of Beijing Beijing
Beijing Red Star Beijing
Beijing Rising Beijing
Guangzhou Development Bank Guangzhou
Guangzhou Honda Guangzhou
Shenzhen Thunder Shenzhen
Tencent Shenzhen
多表关联和单表关联类似,都类似于数据库中的自然连接。相比较单表连接,多表连接的左右表和连接列分得更清楚。所以用和单表类似的方法,map识别出输入的行属于哪个表之后,对其进行分割,将连接的列值保存在key中,另一列和左右表标识保存在value中,然后输出。reduce拿到连接结果之后,解析value内容,根据标志将左右表内容分开存放,然后求笛卡尔积,最后直接输出。
代码:
package mapreduce;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import mapreduce.sort.MyMapper;
import mapreduce.sort.MyReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class SST {
static String INPUT_PATH="hdfs://master:9000/show/";
static String OUTPUT_PATH="hdfs://master:9000/output";
static class MyMapper extends Mapper<Object, Object, Text, Text>{
Text output_key=new Text();
Text output_value=new Text();
String tableName="";
protected void setup(Context context) throws java.io.IOException, java.lang.InterruptedException{
FileSplit fs= (FileSplit) context.getInputSplit();
tableName=fs.getPath().getName();
System.out.println(tableName);
}
protected void map(Object key,Object value,Context context) throws IOException, InterruptedException{
String[] str=value.toString().split(",");
if(tableName.equals("file1")){
output_key.set(str[1]);
output_value.set(1+","+str[0]+","+str[1]);
}
else if(tableName.equals("file2")){
output_key.set(str[0]);
output_value.set(2+","+str[0]+","+str[1]);
}
context.write(output_key,output_value);
System.out.println(str[0]+str[1]);
}
}
static class MyReduce extends Reducer<Text, Text, Text, Text>{
Text outputkey=new Text();
Text outputvalue=new Text();
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException{
List<String> facn=new ArrayList();
List<String> addr=new ArrayList();
for(Text line:values){
String[] str=line.toString().split(",");
System.out.println(str[0]);
if(str[0].equals("1")){
facn.add(str[1]);
System.out.println("===="+str[1]);
}
else if(str[0].equals("2")){
addr.add(str[2]);
}
}
for(String a:facn)
for(String b:addr){
outputkey.set(a);
outputvalue.set(b);
context.write(outputkey, outputvalue);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// TODO Auto-generated method stub
Path outputpath=new Path(OUTPUT_PATH);
Configuration conf=new Configuration();
FileSystem fs=outputpath.getFileSystem(conf);
Job job=Job.getInstance(conf);
FileInputFormat.setInputPaths(job, INPUT_PATH);
FileOutputFormat.setOutputPath(job,outputpath);
if( fs.exists(outputpath)){
fs.delete(outputpath);
//System.out.print("success");
}
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReduce.class);
// job.setPartitionerClass(MyPartitioner.class);
// job.setNumReduceTasks(2);
// job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.waitForCompletion(true);
}
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import mapreduce.sort.MyMapper;
import mapreduce.sort.MyReduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
public class SST {
static String INPUT_PATH="hdfs://master:9000/show/";
static String OUTPUT_PATH="hdfs://master:9000/output";
static class MyMapper extends Mapper<Object, Object, Text, Text>{
Text output_key=new Text();
Text output_value=new Text();
String tableName="";
protected void setup(Context context) throws java.io.IOException, java.lang.InterruptedException{
FileSplit fs= (FileSplit) context.getInputSplit();
tableName=fs.getPath().getName();
System.out.println(tableName);
}
protected void map(Object key,Object value,Context context) throws IOException, InterruptedException{
String[] str=value.toString().split(",");
if(tableName.equals("file1")){
output_key.set(str[1]);
output_value.set(1+","+str[0]+","+str[1]);
}
else if(tableName.equals("file2")){
output_key.set(str[0]);
output_value.set(2+","+str[0]+","+str[1]);
}
context.write(output_key,output_value);
System.out.println(str[0]+str[1]);
}
}
static class MyReduce extends Reducer<Text, Text, Text, Text>{
Text outputkey=new Text();
Text outputvalue=new Text();
protected void reduce(Text key, Iterable<Text> values, Context context)
throws IOException, InterruptedException{
List<String> facn=new ArrayList();
List<String> addr=new ArrayList();
for(Text line:values){
String[] str=line.toString().split(",");
System.out.println(str[0]);
if(str[0].equals("1")){
facn.add(str[1]);
System.out.println("===="+str[1]);
}
else if(str[0].equals("2")){
addr.add(str[2]);
}
}
for(String a:facn)
for(String b:addr){
outputkey.set(a);
outputvalue.set(b);
context.write(outputkey, outputvalue);
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
// TODO Auto-generated method stub
Path outputpath=new Path(OUTPUT_PATH);
Configuration conf=new Configuration();
FileSystem fs=outputpath.getFileSystem(conf);
Job job=Job.getInstance(conf);
FileInputFormat.setInputPaths(job, INPUT_PATH);
FileOutputFormat.setOutputPath(job,outputpath);
if( fs.exists(outputpath)){
fs.delete(outputpath);
//System.out.print("success");
}
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReduce.class);
// job.setPartitionerClass(MyPartitioner.class);
// job.setNumReduceTasks(2);
// job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
job.waitForCompletion(true);
}
}