1. Introduction
Most readers will be familiar with SQL's join operation: to obtain a complete result we often need to fetch data from two or more tables, and that is exactly what a join does; the tables in a database are linked to one another through keys.
In a big-data setting, however, implementing a join with the MapReduce programming model is fairly tedious. In practice we can of course let frameworks such as Hive or Spark SQL handle joins for us, but the underlying principle is still worth mastering, since it helps a great deal in understanding how those frameworks implement joins under the hood.
2. Requirements Analysis
The Persons table (sample data):

Id_P    LastName    FirstName    Address           City
1       Adams       John         Oxford Street     London
2       Bush        George       Fifth Avenue      New York
3       Carter      Thomas       Changan Street    Beijing
Note that the "Id_P" column is the primary key of the Persons table. This means that no two rows can share the same Id_P; even if two people have exactly the same name, their Id_P still tells them apart.
The Orders table (sample data):

Id_O    OrderNo    Id_P
1       77895      3
2       44678      3
3       22456      1
4       24562      1
5       34764      65
Note that the "Id_O" column is the primary key of the Orders table, and that the "Id_P" column in Orders references a person in the Persons table without using that person's exact name.
Requirement: by referencing both tables we can fetch data from the two of them together: who ordered products, and which products they ordered.
The corresponding SQL statement is:
SELECT Persons.LastName, Persons.FirstName, Orders.OrderNo
FROM Persons, Orders
WHERE Persons.Id_P = Orders.Id_P
Result set:

LastName    FirstName    OrderNo
Adams       John         22456
Adams       John         24562
Carter      Thomas       77895
Carter      Thomas       44678
Below we produce the same result with MapReduce.
3. Implementation Approach
- On the map side, read all input files and tag every output record with a flag that records which file (i.e. which table) the record came from.
- In the reduce function, separate the incoming records by that flag.
- Then join the two groups on the key and write the result out directly; a worked example follows below.
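To make this concrete, here is what a single reduce call would receive for the join key Id_P = 3, assuming the sample data above (the notation is illustrative, not actual code):

key    = 3
values = [ (LastName=Carter, FirstName=Thomas, flag=0),
           (OrderNo=77895, flag=1),
           (OrderNo=44678, flag=1) ]

The reduce function keeps the single person record aside, collects the order records into a list, and emits one joined line per order: Carter Thomas 77895 and Carter Thomas 44678. This pattern is commonly known as a reduce-side join.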
4. Implementation Code
4.1 Uploading the Data
The two tables are stored as plain tab-separated text files without header rows, matching the split("\t") in the mapper below (sample content).
Persons:

1	Adams	John	Oxford Street	London
2	Bush	George	Fifth Avenue	New York
3	Carter	Thomas	Changan Street	Beijing

Orders:

1	77895	3
2	44678	3
3	22456	1
4	24562	1
5	34764	65
Upload the files:
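For example, using the HDFS shell (the local file names are assumptions; the target directory must match INPUT_PATH in JoinApp below):

hdfs dfs -mkdir -p /inputjoin
hdfs dfs -put persons.txt orders.txt /inputjoin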
4.2 Source Code
The ResultTable class:
package com.mapreduce.join;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class ResultTable implements WritableComparable<ResultTable> {

    private String perLastName = "";
    private String perFirstName = "";
    private String ordNo = "";
    private int flag = 0;

    public ResultTable() {
    }

    public ResultTable(String perLastName, String perFirstName,
                       String ordNo, int flag) {
        this.perLastName = perLastName;
        this.perFirstName = perFirstName;
        this.ordNo = ordNo;
        this.flag = flag;
    }

    // Copy constructor: Hadoop reuses the value object while iterating in
    // reduce, so records that must outlive one iteration have to be copied.
    public ResultTable(ResultTable r) {
        this.perLastName = r.perLastName;
        this.perFirstName = r.perFirstName;
        this.ordNo = r.ordNo;
        this.flag = r.flag;
    }

    public String getPerLastName() {
        return perLastName;
    }

    public void setPerLastName(String perLastName) {
        this.perLastName = perLastName;
    }

    public String getPerFirstName() {
        return perFirstName;
    }

    public void setPerFirstName(String perFirstName) {
        this.perFirstName = perFirstName;
    }

    public String getOrdNo() {
        return ordNo;
    }

    public void setOrdNo(String ordNo) {
        this.ordNo = ordNo;
    }

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

    // Controls how each joined record is rendered in the reduce output.
    @Override
    public String toString() {
        return this.perLastName + "\t" + this.perFirstName + "\t" + this.ordNo;
    }

    // ResultTable is only used as a map output value, never as a key,
    // so no meaningful ordering is needed here.
    @Override
    public int compareTo(ResultTable o) {
        return 0;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.perLastName);
        out.writeUTF(this.perFirstName);
        out.writeUTF(this.ordNo);
        out.writeInt(this.flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.perLastName = in.readUTF();
        this.perFirstName = in.readUTF();
        this.ordNo = in.readUTF();
        this.flag = in.readInt();
    }
}
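Because subtle bugs in write/readFields are a common source of corrupted values, it can be worth sanity-checking the serialization with a quick local round trip (a minimal sketch, not part of the original job; the class name is made up):

package com.mapreduce.join;

import java.io.*;

public class ResultTableRoundTrip {
    public static void main(String[] args) throws IOException {
        ResultTable before = new ResultTable("Carter", "Thomas", "77895", 1);
        // Serialize with write() ...
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bytes));
        // ... and deserialize with readFields(); the round trip must be lossless.
        ResultTable after = new ResultTable();
        after.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(after); // expected: Carter	Thomas	77895
    }
}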
The MyMapper class:
package com.mapreduce.join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MyMapper extends Mapper<LongWritable, Text, LongWritable, ResultTable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String val = value.toString();
        String[] strs = val.split("\t");
        if (strs.length <= 3) { // an Orders record (Id_O, OrderNo, Id_P); Persons has more columns
            ResultTable resultTable = new ResultTable();
            resultTable.setOrdNo(strs[1]); // column 2: OrderNo
            resultTable.setFlag(1);        // flag 1 marks records from the Orders table
            // strs[2] is the Id_P column of Orders, i.e. the join key
            context.write(new LongWritable(Long.valueOf(strs[2])), resultTable);
        } else { // a Persons record
            ResultTable resultTable = new ResultTable();
            resultTable.setPerLastName(strs[1]);
            resultTable.setPerFirstName(strs[2]);
            resultTable.setFlag(0);        // flag 0 marks records from the Persons table
            // strs[0] is the Id_P column of Persons, i.e. the join key
            context.write(new LongWritable(Long.valueOf(strs[0])), resultTable);
        }
    }
}
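As a concrete trace, assuming the sample files above: the Orders line 2 / 44678 / 3 has three columns, so it is emitted under key 3 as an order-flagged ResultTable carrying OrderNo 44678, while the Persons line 3 / Carter / Thomas / Changan Street / Beijing has five columns and is emitted under key 3 as a person-flagged ResultTable. Because both records share key 3, the shuffle delivers them to the same reduce call, which is where the actual join happens.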
The MyReducer class:
package com.mapreduce.join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class MyReducer extends Reducer<LongWritable, ResultTable, NullWritable, Text> {

    @Override
    protected void reduce(LongWritable key, Iterable<ResultTable> iterables, Context context)
            throws IOException, InterruptedException {
        List<ResultTable> list = new ArrayList<ResultTable>();
        // If person stays null, the Id_P of these orders does not exist in Persons.
        ResultTable person = null;
        for (ResultTable tmp : iterables) {
            // Hadoop reuses tmp between iterations, hence the copy constructor.
            if (tmp.getFlag() == 0) { // a Persons record
                person = new ResultTable(tmp); // Id_P is a primary key, so at most one person
            } else { // an Orders record
                list.add(new ResultTable(tmp)); // one person may have several orders
            }
        }
        if (person != null) {
            for (ResultTable order : list) {
                person.setOrdNo(order.getOrdNo());
                context.write(NullWritable.get(), new Text(person.toString()));
            }
        }
    }
}
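Note that this reducer implements inner-join semantics: orders whose Id_P matches no person (Id_P = 65 in the sample data) are silently dropped, and so are persons without any orders (Bush). If you instead wanted LEFT JOIN semantics on the Persons side, a minimal change (a sketch, not part of the original code) would be to also emit the person when no order arrived, at the end of reduce():

if (person != null && list.isEmpty()) {
    person.setOrdNo(""); // no matching order: emit the person with an empty OrderNo
    context.write(NullWritable.get(), new Text(person.toString()));
}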
The JoinApp driver class:
package com.mapreduce.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.net.URI;

public class JoinApp {

    private static final String INPUT_PATH = "hdfs://master001:9000/inputjoin";
    private static final String OUTPUT_PATH = "hdfs://master001:9000/outputjoin";

    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop");
        Configuration conf = new Configuration();
        // For robustness: remove the output directory if it already exists,
        // otherwise the job would fail on startup.
        final FileSystem fileSystem = FileSystem.get(URI.create(INPUT_PATH), conf);
        if (fileSystem.exists(new Path(OUTPUT_PATH))) {
            fileSystem.delete(new Path(OUTPUT_PATH), true);
        }
        Job job = Job.getInstance(conf, "Join");
        // The main class of the runnable jar.
        job.setJarByClass(JoinApp.class);
        // Configure the map side.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(ResultTable.class);
        // Configure the reduce side.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        // Configure the input format.
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
        // Configure the output format.
        job.setOutputFormatClass(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        // Submit the job and wait for completion.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
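To run the job, package the classes into a jar and submit it with the hadoop command (the jar name here is illustrative):

hadoop jar mapreduce-join.jar com.mapreduce.join.JoinApp

When the job finishes, the joined result can be inspected with:

hdfs dfs -cat /outputjoin/part-r-00000

With the sample data above this should print something like the following (the order of lines within one Id_P is not guaranteed):

Adams	John	22456
Adams	John	24562
Carter	Thomas	77895
Carter	Thomas	44678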