多表连接
对多个具有关联性的表,对其中原始数据进行一定的处理,从其中挖掘出关心的信息,常用来进行多表之间的连接查询
本帖为Map端Join来实现多表连接,这也是最为高效的join方法
思想
将小表进行分布式缓存,在map-task阶段读取缓存文件数据存储到内存数据结构中,以供reduce阶段连接查找。
适用场景
有一个或者多个小表(文件)
优点
将小表缓存,可以高效查询;由于在map阶段进行连接,所以将会大大减小map到reduce端的数据传输,从而减少不必要的shuffle耗时,提高整个mr的执行效率
缺点
如果业务全是大表不适合
举例
login.txt
uid sexid logindate
1 1 2017-04-17
2 2 2017-04-15
3 1 2017-04-16
4 2 2017-04-14
5 1 2017-04-13
6 2 2017-04-13
7 1 2017-04-12
8 2 2017-04-11
9 0 2017-04-10
user.txt
uid uname
1 小红
2 小行
3 小通
4 小闪
5 小镇
6 小振
7 小秀
8 小微
9 小懂
10 小明
11 小刚
12 小举
13 小黑
14 小白
15 小鹏
16 小习
sex.txt
sexid sex
0 未知
1 男
2 女
样例输出
代码
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
public class MapSideJoin {
public static class MyMapper extends Mapper<Object, Text, Text, NullWritable>{
public Text k = new Text();
//读取缓存文件,并按照文件名称读取到map或者别的数据结构中
//定义一个存储sex缓存的数据结构
Map<String,String> sexMap = new HashMap<>();
Map<String,String> userMap = new HashMap<>();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
for (Path path : paths) {
String filename = path.getName();
BufferedReader br = null;
if(filename.equals("sex.txt")){
br = new BufferedReader(new FileReader(new File(path.toString())));
while(br.ready()){
String row = br.readLine();
String[] sexs = row.split(" ");
sexMap.put(sexs[0],sexs[1]);
}
}else if(filename.equals("user.txt")){
br = new BufferedReader(new FileReader(new File(path.toString())));
while(br.ready()){
String row = br.readLine();
String[] names = row.split(" ");
userMap.put(names[0],names[1]);
}
}
if(br != null){
br.close();
}
}
}
//读取login.txt进行处理
@Override
protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
String row = value.toString();
String[] values = row.split(" ");
String uid = values[0];
String sexid = values[1];
String uname = userMap.getOrDefault(uid,"");
String sex = sexMap.getOrDefault(sexid,"");
k.set(uname + "\t" + sex + "\t" + values[2]);
context.write(k,NullWritable.get());
}
}
public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable>{
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key,NullWritable.get());
}
}
public static void main(String[] args) throws IOException {
try {
//Configuration conf = HdfsUtils.getConf();
Job job = Job.getInstance(new Configuration(),"myMapSideJoin");
job.setJarByClass(MapSideJoin.class);
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setReducerClass(MyReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//设置缓存
job.addCacheFile(new URI("file://hadoop01:9000/sex.txt"));
job.addCacheFile(new URI("hdfs://hadoop01:9000/user.txt"));
FileInputFormat.addInputPath(job,new Path("/input/mapjoin/login.txt"));
FileOutputFormat.setOutputPath(job,new Path("/output/mr/mapjoin/test1/"));
int success = job.waitForCompletion(true) ? 0 : 1;
System.exit(success);
} catch (InterruptedException e) {
e.printStackTrace();
} catch (ClassNotFoundException e) {
e.printStackTrace();
}catch (URISyntaxException e){
e.printStackTrace();
}
}
}