Join
Use case
Table EMP:
Name   Sex     Age  DepNo
zhang  male    20   1
li     female  25   2
wang   female  30   3
zhou   male    35   2

Table DEP:
DepNo  DepName
1      Sales
2      Dev
3      Mgt
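For reference, joining EMP with DEP on DepNo should produce the following rows (name, sex, age and department name; output order may vary):

zhang male 20 Sales
li female 25 Dev
wang female 30 Mgt
zhou male 35 Dev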
A reduce-side join is more general than a map-side join because it places no structural requirements on the input data; it is, however, less efficient, because all of the data has to pass through the shuffle.
Basic approach:
1) On the map side, read every input file and tag each output record with an identifier for the file (table) it came from.
2) In the reduce function, separate the incoming records according to that tag.
3) Join the records that share the same key and write the result straight to the output (see the grouping example below).
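With the sample data above, the mapper keys every tagged record by DepNo, so all records for one department meet in a single reduce call. For key 2, for example, the reducer receives the DEP record for Dev together with the two matching EMP records:

key = 2 -> (DEP) 2 Dev
           (EMP) li female 25 2
           (EMP) zhou male 35 2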
Define the EMP_DEP.java entity bean that holds the data from both tables:
package com.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Entity bean that carries the fields of both EMP and DEP records through the shuffle.
// The "table" field tags each record with the table it came from ("EMP" or "DEP").
public class EMP_DEP implements WritableComparable<EMP_DEP> {

    private String name = "";
    private String sex = "";
    private int age = 0;
    private int depNo = 0;
    private String depName = "";
    private String table = "";

    public EMP_DEP() {
    }

    // Copy constructor: needed because Hadoop reuses the value object it hands to the reducer.
    public EMP_DEP(EMP_DEP emp_dep) {
        this.name = emp_dep.getName();
        this.sex = emp_dep.getSex();
        this.age = emp_dep.getAge();
        this.depNo = emp_dep.getDepNo();
        this.depName = emp_dep.getDepName();
        this.table = emp_dep.getTable();
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = sex;
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public int getDepNo() {
        return depNo;
    }

    public void setDepNo(int depNo) {
        this.depNo = depNo;
    }

    public String getDepName() {
        return depName;
    }

    public void setDepName(String depName) {
        this.depName = depName;
    }

    public String getTable() {
        return table;
    }

    public void setTable(String table) {
        this.table = table;
    }

    // Deserialization must read the fields in exactly the same order as write() emits them.
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.sex = in.readUTF();
        this.age = in.readInt();
        this.depNo = in.readInt();
        this.depName = in.readUTF();
        this.table = in.readUTF();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(sex);
        out.writeInt(age);
        out.writeInt(depNo);
        out.writeUTF(depName);
        out.writeUTF(table);
    }

    // EMP_DEP is only ever used as a map output value, never as a key,
    // so its ordering does not matter.
    @Override
    public int compareTo(EMP_DEP o) {
        return 0;
    }

    @Override
    public String toString() {
        return name + " " + sex + " " + age + " " + depName;
    }
}
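The Writable contract requires readFields() to reconstruct exactly what write() serialized, in the same field order. A minimal stand-alone round-trip check (not part of the job; the class name WritableRoundTrip and the use of plain java.io streams are only for illustration) could look like this:

package com.join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Hypothetical helper: serializes an EMP_DEP and reads it back to verify
// that write() and readFields() agree on field order and types.
public class WritableRoundTrip {
    public static void main(String[] args) throws IOException {
        EMP_DEP original = new EMP_DEP();
        original.setName("zhang");
        original.setSex("male");
        original.setAge(20);
        original.setDepNo(1);
        original.setDepName("Sales");
        original.setTable("EMP");

        // Serialize with write(), exactly as the MapReduce framework would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize with readFields() and compare the rendered output.
        EMP_DEP copy = new EMP_DEP();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println("original: " + original);
        System.out.println("copy    : " + copy);
    }
}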
Define the Mapper:
package com.join;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Reads lines from both input files and tags each record with its source table.
// EMP and DEP lines are told apart by their number of columns (4 vs 2).
// The map output key is DepNo, so matching records meet in the same reduce call.
public class ReduceSideMapper extends Mapper<LongWritable, Text, IntWritable, EMP_DEP> {

    private EMP_DEP emp_dep = new EMP_DEP();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] values = value.toString().split("\\s+");
        // An EMP line: Name Sex Age DepNo
        if (values.length == 4) {
            emp_dep.setName(values[0]);
            emp_dep.setSex(values[1]);
            emp_dep.setAge(Integer.valueOf(values[2]));
            emp_dep.setDepNo(Integer.valueOf(values[3]));
            emp_dep.setTable("EMP");
            context.write(new IntWritable(Integer.valueOf(values[3])), emp_dep);
        }
        // A DEP line: DepNo DepName
        if (values.length == 2) {
            emp_dep.setDepNo(Integer.valueOf(values[0]));
            emp_dep.setDepName(values[1]);
            emp_dep.setTable("DEP");
            context.write(new IntWritable(Integer.valueOf(values[0])), emp_dep);
        }
    }
}
Define the Reducer:
package com.join;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

// All records sharing a DepNo arrive in one reduce call. The single DEP record
// supplies the department name, which is then copied onto every buffered EMP record.
public class ReduceSideReducer extends
        Reducer<IntWritable, EMP_DEP, NullWritable, EMP_DEP> {

    @Override
    protected void reduce(IntWritable key, Iterable<EMP_DEP> value, Context context)
            throws IOException, InterruptedException {
        String depName = "";
        List<EMP_DEP> list = new LinkedList<EMP_DEP>();
        for (EMP_DEP val : value) {
            // DepNo is the primary key of DEP, so there is exactly one DEP record per key.
            if (val.getTable().equals("DEP")) {
                depName = val.getDepName();
            } else {
                // Copy the value: Hadoop reuses the object handed out by the iterator.
                list.add(new EMP_DEP(val));
            }
        }
        // Complete each buffered EMP record with its department name and emit the joined row.
        for (EMP_DEP v : list) {
            v.setDepName(depName);
            context.write(NullWritable.get(), v);
        }
    }
}
Define the driver class TestReduceSideJoin:
package com.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class TestReduceSideJoin {

    public static void main(String args[]) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: TestReduceSideJoin <in> <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "Reduce side join");
        job.setJarByClass(TestReduceSideJoin.class);
        job.setMapperClass(ReduceSideMapper.class);
        job.setReducerClass(ReduceSideReducer.class);
        // The map output types differ from the final output types, so both must be declared.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(EMP_DEP.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(EMP_DEP.class);
        // The single input directory holds both the EMP and the DEP file;
        // the mapper tells the two record formats apart by column count.
        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
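Assuming the EMP and DEP files have been uploaded into a single HDFS input directory and the classes packaged into a jar (the jar name and paths below are placeholders), the job can be launched in the usual way:

hadoop jar join.jar com.join.TestReduceSideJoin /user/hadoop/join/in /user/hadoop/join/out

The joined rows, rendered by EMP_DEP.toString(), then appear in the part-r-* files under the output directory.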