MapReduce 基础案例之多表连接 Map端 Join 实现

最新推荐文章于 2022-07-31 20:29:24 发布

萌萌哒的理工男

最新推荐文章于 2022-07-31 20:29:24 发布

阅读量1.7k

点赞数 1

分类专栏： Hadoop # MapReduce 文章标签： MapReduce 多表连接 map端 join 案例

本文链接：https://blog.csdn.net/qq_38924171/article/details/100546580

版权

Hadoop 同时被 2 个专栏收录

22 篇文章 1 订阅

订阅专栏

MapReduce

8 篇文章 0 订阅

订阅专栏

多表连接

对多个具有关联性的表，对其中原始数据进行一定的处理，从其中挖掘出关心的信息，常用来进行多表之间的连接查询

本帖为Map端Join来实现多表连接，这也是最为高效的join方法

思想

将小表进行分布式缓存，在map-task阶段读取缓存文件数据存储到内存数据结构中，以供reduce阶段连接查找。

适用场景

有一个或者多个小表(文件)

优点

将小表缓存，可以高效查询；由于在map阶段进行连接，所以将会大大减小map到reduce端的数据传输，从而减少不必要的shuffle耗时，提高整个mr的执行效率

缺点

如果业务全是大表不适合

举例

login.txt

uid sexid logindate

1   1   2017-04-17
2   2   2017-04-15
3   1   2017-04-16
4   2   2017-04-14
5   1   2017-04-13
6   2   2017-04-13
7   1   2017-04-12
8   2   2017-04-11
9   0   2017-04-10

user.txt

uid uname

1 小红
2 小行
3 小通
4 小闪
5 小镇
6 小振
7 小秀
8 小微
9 小懂
10 小明
11 小刚
12 小举
13 小黑
14 小白
15 小鹏
16 小习

sex.txt

sexid sex

0 未知
1 男
2 女

样例输出

代码


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;

public class MapSideJoin {

    public static class MyMapper extends Mapper<Object, Text, Text, NullWritable>{
        public Text k = new Text();
        //读取缓存文件,并按照文件名称读取到map或者别的数据结构中
        //定义一个存储sex缓存的数据结构
        Map<String,String> sexMap = new HashMap<>();
        Map<String,String> userMap = new HashMap<>();
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            for (Path path : paths) {
                String filename = path.getName();
                BufferedReader br = null;
                if(filename.equals("sex.txt")){
                    br = new BufferedReader(new FileReader(new File(path.toString())));
                    while(br.ready()){
                        String row = br.readLine();
                        String[] sexs = row.split(" ");
                        sexMap.put(sexs[0],sexs[1]);
                    }
                }else if(filename.equals("user.txt")){
                    br = new BufferedReader(new FileReader(new File(path.toString())));
                    while(br.ready()){
                        String row = br.readLine();
                        String[] names = row.split(" ");
                        userMap.put(names[0],names[1]);
                    }
                }
                if(br != null){
                    br.close();
                }
            }
        }
        //读取login.txt进行处理
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String row = value.toString();
            String[] values = row.split(" ");
            String uid = values[0];
            String sexid = values[1];
            String uname = userMap.getOrDefault(uid,"");
            String sex = sexMap.getOrDefault(sexid,"");
            k.set(uname + "\t" + sex + "\t" + values[2]);
            context.write(k,NullWritable.get());
        }
    }

    public static class MyReducer extends Reducer<Text, NullWritable, Text, NullWritable>{
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            context.write(key,NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException {
        try {
            //Configuration conf = HdfsUtils.getConf();
            Job job = Job.getInstance(new Configuration(),"myMapSideJoin");
            job.setJarByClass(MapSideJoin.class);
            job.setMapperClass(MyMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(NullWritable.class);
            job.setReducerClass(MyReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            //设置缓存
            job.addCacheFile(new URI("file://hadoop01:9000/sex.txt"));
            job.addCacheFile(new URI("hdfs://hadoop01:9000/user.txt"));

            FileInputFormat.addInputPath(job,new Path("/input/mapjoin/login.txt"));
            FileOutputFormat.setOutputPath(job,new Path("/output/mr/mapjoin/test1/"));
            int success = job.waitForCompletion(true) ? 0 : 1;
            System.exit(success);
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }catch (URISyntaxException e){
            e.printStackTrace();
        }
    }
}