【十八掌●武功篇】第七掌：MapReduce之join详解

最新推荐文章于 2021-11-29 11:20:52 发布

鸣宇淳

最新推荐文章于 2021-11-29 11:20:52 发布

阅读量456

点赞数

分类专栏：大数据技术文章标签： MapReduce

本文链接：https://blog.csdn.net/chybin500/article/details/79016825

版权

大数据技术专栏收录该内容

41 篇文章 9 订阅

订阅专栏

这一篇博文是【大数据技术●降龙十八掌】系列文章的其中一篇，点击查看目录：大数据技术●降龙十八掌

一、Reduce Join

reduce端的join操作大概是所有mapreduce join操作中最简单的一种，它通过某一个外键将多个数据集连接起来，可以非常容易地实现inner join、left join、right join、full join，并且它对参与连接的数据集大小没有限制，还有它可以一次连接任意多个数据集。如果参与连接的数据量都特别大，可能reduce端的join是唯一可以用的方法。

reduce端的join缺点就是要将大量的数据传送到reduce端进行join操作，所以会消耗大量的网络带宽来传输数据。

1、reduce join的过程

mapper从每个数据集中读取每条记录数据，有几个数据集就有几类mapper，从记录中抽取其外键作为key值，整条记录作为value，同时输出值通过一个标识来标记来源于哪个数据集。
经过map的shuffle后，通过网络传输，将mapper输出值传递给reduce所在节点。
reduce shuffle将各个map传递过来的数据进行排序分组后，形成一个外键值一组数据，一组数据执行一次reduce函数。
在reduce函数中，将当前键下的带不同标识的数据存入不同的数组中，比如将A文件的数据存入listA，将B文件来的数据存入listB，然后根据join类型（inner join、left join、right join、full join）来进行连接操作。
操作结果存入part文件，part文件的数量和reduce的个数一致。

2、reduce join实例


package mapreduce.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

/**
 * Command-line entry point for the reduce-side join example.
 *
 * <p>Created by 鸣宇淳 on 2018/1/9.
 */
public class UserJoinMain {
    /**
     * Runs {@link ReduceJoinMapReduce} through {@link ToolRunner} with a fresh
     * {@link Configuration} and terminates the JVM with the status the tool returns.
     *
     * @param args command-line arguments, handed to {@code ToolRunner.run} unchanged
     * @throws Exception anything thrown by {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        final int exitCode = ToolRunner.run(new Configuration(), new ReduceJoinMapReduce(), args);
        System.exit(exitCode);
    }
}


package mapreduce.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Reduce-side join example: joins two comma-separated files on the province ID and supports
 * inner, left, right and full joins.
 *
 * <p>Created by 鸣宇淳 on 2018/1/9.
 *
 * <p>The job reads two files, pv and province, and uses the province ID as the join key.
 * The first file has the columns: province ID (provinceId), page views (pv).
 * The second file has the columns: ID, province ID (provinceId), province name (provinceName).
 * Columns in both files are separated by a comma (,).
 *
 * <p>Command-line arguments, in order: pv input path, province input path, output path,
 * number of reduce tasks, join type (inner, left, right or full; case-insensitive).
 *
 * <p>---------------------pv file content: begin-------------------
 * 110000,54878513
 * 120000,16596320
 * 130000,17829678
 * 420000,14703733
 * 430000,12112778
 * 440000,54250547
 * 450000,8231348
 * 460000,1701320
 * 500000,6104667
 * 510000,17313608
 * 520000,5361314
 * 530000,5987739
 * 540000,419185
 * 610000,10126813
 * 620000,3200773
 * 630000,953499
 * 640000,1163306
 * 650000,3168712
 * 820000,71672
 * 990000,18698496
 * ---------------------pv file content: end-------------------
 * <p>
 * ---------------------province file content: begin-------------------
 * 1,110000,北京
 * 2,120000,天津
 * 3,130000,河北
 * 4,140000,山西
 * 6,210000,辽宁
 * 7,220000,吉林
 * 8,230000,黑龙江
 * 25,530000,云南
 * 30,640000,宁夏
 * 31,650000,新疆
 * 32,710000,台湾
 * 33,810000,香港
 * 34,820000,澳门
 * 35,910000,海外
 * 36,990000,其它
 * ---------------------province file content: end-------------------
 */
public class ReduceJoinMapReduce extends Configured implements Tool {

    /** Configuration property that carries the join type from the driver to the reducers. */
    private static final String JOIN_TYPE_KEY = "join.type";
    /** Flag attached to records that come from the pv file. */
    private static final String FLAG_PV = "A";
    /** Flag attached to records that come from the province file. */
    private static final String FLAG_PROVINCE = "B";
    /** Counter group used to report input records that were skipped. */
    private static final String COUNTER_GROUP = "ReduceJoin";
    /**
     * Stand-in for the two columns of a missing side: two empty columns separated by one tab.
     * Using the same filler on both sides keeps every output row at four columns.
     */
    private static final String EMPTY_SIDE = "\t";
    /** Exit status returned when the command line is invalid. */
    private static final int EXIT_USAGE = 2;

    /**
     * Configures and runs the join job.
     *
     * @param args pv input path, province input path, output path, number of reduce tasks,
     *             join type (inner, left, right or full)
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are invalid
     * @throws Exception if the job cannot be created or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        // Validate the command line up front instead of failing later with an
        // ArrayIndexOutOfBoundsException or silently producing an empty result.
        if (args.length < 5) {
            return usageError("Expected 5 arguments but got " + args.length);
        }
        int reduceTasks;
        try {
            reduceTasks = Integer.parseInt(args[3]);
        } catch (NumberFormatException e) {
            return usageError("Number of reduce tasks is not an integer: " + args[3]);
        }
        String joinType = args[4];
        if (!isSupportedJoinType(joinType)) {
            return usageError("Unsupported join type: " + joinType);
        }

        Configuration configuration = this.getConf();
        // The reducers read the join type back from the job configuration.
        configuration.set(JOIN_TYPE_KEY, joinType);

        Job job = Job.getInstance(configuration, ReduceJoinMapReduce.class.getSimpleName());
        job.setJarByClass(ReduceJoinMapReduce.class);
        // Each input file gets its own mapper so records can be tagged with their origin.
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, UserJoinMapperA.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, UserJoinMapperB.class);
        Path outpath = new Path(args[2]);
        FileOutputFormat.setOutputPath(job, outpath);
        // Map output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(JoinValue.class);
        job.setReducerClass(ReduceJoinMapReduce.UserJoinReducer.class);
        // Final output types.
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);

        job.setNumReduceTasks(reduceTasks);
        boolean isSucces = job.waitForCompletion(true);
        return isSucces ? 0 : 1;
    }

    /** Returns whether the reducer implements the given join type (case-insensitive). */
    private static boolean isSupportedJoinType(String joinType) {
        return "inner".equalsIgnoreCase(joinType)
                || "left".equalsIgnoreCase(joinType)
                || "right".equalsIgnoreCase(joinType)
                || "full".equalsIgnoreCase(joinType);
    }

    /** Prints the problem and the expected command line to stderr and returns the usage status. */
    private static int usageError(String message) {
        System.err.println(message);
        System.err.println("Usage: " + ReduceJoinMapReduce.class.getSimpleName()
                + " <pvInput> <provinceInput> <output> <numReduceTasks> <inner|left|right|full>");
        return EXIT_USAGE;
    }

    /**
     * Mapper for the pv file. Emits the province ID as key and a {@link JoinValue} flagged "A"
     * whose line text is {@code provinceId \t pv}.
     */
    public static class UserJoinMapperA extends Mapper<LongWritable, Text, IntWritable, JoinValue> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Every line is comma-separated; a pv record has exactly two columns.
            String[] list = value.toString().split(",");
            if (list.length != 2) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_PV_RECORDS").increment(1);
                return;
            }
            // The first column is the province ID. A non-numeric ID is invalid data and is
            // skipped like any other malformed row instead of failing the whole task.
            int provinceId;
            try {
                provinceId = Integer.parseInt(list[0]);
            } catch (NumberFormatException e) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_PV_RECORDS").increment(1);
                return;
            }
            JoinValue joinValue = new JoinValue();
            joinValue.setJoinKey(new IntWritable(provinceId));
            // Output columns are tab-separated.
            joinValue.setLineText(new Text(list[0] + "\t" + list[1]));
            // Tag the record as coming from the pv file.
            joinValue.setFlag(new Text(FLAG_PV));
            context.write(joinValue.getJoinKey(), joinValue);
        }
    }


    /**
     * Mapper for the province file. Emits the province ID as key and a {@link JoinValue} flagged
     * "B" whose line text is {@code provinceId \t provinceName}.
     */
    public static class UserJoinMapperB extends Mapper<LongWritable, Text, IntWritable, JoinValue> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Every line is comma-separated; a province record has exactly three columns.
            String[] list = value.toString().split(",");
            if (list.length != 3) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_PROVINCE_RECORDS").increment(1);
                return;
            }
            // The second column is the province ID; skip rows whose ID is not numeric.
            int provinceId;
            try {
                provinceId = Integer.parseInt(list[1]);
            } catch (NumberFormatException e) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_PROVINCE_RECORDS").increment(1);
                return;
            }
            JoinValue joinValue = new JoinValue();
            joinValue.setJoinKey(new IntWritable(provinceId));
            joinValue.setLineText(new Text(list[1] + "\t" + list[2]));
            // Tag the record as coming from the province file.
            joinValue.setFlag(new Text(FLAG_PROVINCE));
            context.write(joinValue.getJoinKey(), joinValue);
        }
    }

    /**
     * Reducer that performs the join. For every province ID it separates the incoming records
     * by their origin flag and then combines them according to the configured join type.
     */
    public static class UserJoinReducer extends Reducer<IntWritable, JoinValue, IntWritable, Text> {
        // Join type, passed in through the job configuration.
        private String joinType = null;
        // Records of the current key that came from the pv file.
        List<JoinValue> listA = new ArrayList<JoinValue>();
        // Records of the current key that came from the province file.
        List<JoinValue> listB = new ArrayList<JoinValue>();

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            joinType = context.getConfiguration().get(JOIN_TYPE_KEY);
            if (joinType == null) {
                // Fail with a clear message rather than a NullPointerException in reduce().
                throw new IOException("Missing required configuration property: " + JOIN_TYPE_KEY);
            }
        }

        @Override
        protected void reduce(IntWritable key, Iterable<JoinValue> values, Context context) throws IOException, InterruptedException {
            listA.clear();
            listB.clear();

            // Split the records of this key by origin. Each item is cloned before it is stored
            // so the lists hold independent copies rather than references to the iterated object.
            for (JoinValue item : values) {
                String flag = item.getFlag().toString();
                if (FLAG_PV.equals(flag)) {
                    listA.add(item.clone());
                } else if (FLAG_PROVINCE.equals(flag)) {
                    listB.add(item.clone());
                }
            }
            executeJoin(context);
        }

        /**
         * Joins the pv and province records collected for the current key according to the
         * configured join type. An unrecognized join type produces no output.
         */
        private void executeJoin(Context context) throws IOException, InterruptedException {
            if (joinType.equalsIgnoreCase("inner")) {
                emitDrivenByPv(context, false);
            } else if (joinType.equalsIgnoreCase("left")) {
                emitDrivenByPv(context, true);
            } else if (joinType.equalsIgnoreCase("right")) {
                emitDrivenByProvince(context);
            } else if (joinType.equalsIgnoreCase("full")) {
                if (!listA.isEmpty()) {
                    // pv rows exist: same output as a left join for this key.
                    emitDrivenByPv(context, true);
                } else {
                    // Only province rows exist: same output as a right join for this key.
                    emitDrivenByProvince(context);
                }
            }
        }

        /**
         * Writes one row per (pv, province) pair. When the key has no province rows, each pv row
         * is written with empty province columns if {@code keepUnmatched} is set, otherwise
         * nothing is written.
         */
        private void emitDrivenByPv(Context context, boolean keepUnmatched) throws IOException, InterruptedException {
            for (JoinValue a : listA) {
                if (listB.isEmpty()) {
                    if (keepUnmatched) {
                        writeRow(context, a.getJoinKey(), a.getLineText().toString(), EMPTY_SIDE);
                    }
                } else {
                    for (JoinValue b : listB) {
                        writeRow(context, a.getJoinKey(), a.getLineText().toString(), b.getLineText().toString());
                    }
                }
            }
        }

        /**
         * Writes one row per (province, pv) pair; a province row without any pv row is written
         * with empty pv columns.
         */
        private void emitDrivenByProvince(Context context) throws IOException, InterruptedException {
            for (JoinValue b : listB) {
                if (listA.isEmpty()) {
                    writeRow(context, b.getJoinKey(), EMPTY_SIDE, b.getLineText().toString());
                } else {
                    for (JoinValue a : listA) {
                        writeRow(context, b.getJoinKey(), a.getLineText().toString(), b.getLineText().toString());
                    }
                }
            }
        }

        /** Writes a single output row: pv columns, a tab, then province columns. */
        private void writeRow(Context context, IntWritable key, String pvColumns, String provinceColumns)
                throws IOException, InterruptedException {
            context.write(key, new Text(pvColumns + "\t" + provinceColumns));
        }
    }

}

package mapreduce.join;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Map output value of the reduce-side join: one input record together with its join key and a
 * flag that tells which input file it came from.
 *
 * <p>Equality, hashing and ordering are all based on the join key only.
 *
 * <p>Created by 鸣宇淳 on 2018/1/9.
 */
public class JoinValue implements WritableComparable<JoinValue>, Cloneable {

    private IntWritable joinKey; // join key
    private Text flag;           // origin-file flag
    private Text lineText;       // the columns to output for this record

    public void setJoinKey(IntWritable joinKey) {
        this.joinKey = joinKey;
    }

    public void setFlag(Text flag) {
        this.flag = flag;
    }

    public Text getFlag() {
        return flag;
    }

    public IntWritable getJoinKey() {
        return joinKey;
    }

    public Text getLineText() {
        return lineText;
    }

    public void setLineText(Text lineText) {
        this.lineText = lineText;
    }

    /** Creates an instance whose three fields are freshly constructed, empty writables. */
    public JoinValue() {
        this.joinKey = new IntWritable();
        this.flag = new Text();
        this.lineText = new Text();
    }

    /** Serializes the fields in the order join key, flag, line text. */
    @Override
    public void write(DataOutput out) throws IOException {
        this.joinKey.write(out);
        this.flag.write(out);
        this.lineText.write(out);
    }

    /** Deserializes the fields in the same order that {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.joinKey.readFields(in);
        this.flag.readFields(in);
        this.lineText.readFields(in);
    }

    /** Orders instances by join key only. */
    @Override
    public int compareTo(JoinValue o) {
        return this.joinKey.compareTo(o.getJoinKey());
    }

    @Override
    public String toString() {
        return "[flag=" + this.flag.toString() + ",joinKey=" + this.joinKey.toString() + ",lineText=" + this.lineText.toString() + "]";
    }

    /**
     * Two instances are equal when their join keys are equal. Returns {@code false} for
     * {@code null} and for objects of another type instead of throwing.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof JoinValue)) {
            return false;
        }
        return this.getJoinKey().equals(((JoinValue) obj).getJoinKey());
    }

    /** Hashes on the join key only, so that equal instances have equal hash codes. */
    @Override
    public int hashCode() {
        return this.getJoinKey().hashCode();
    }

    /**
     * Returns a copy whose join key, flag and line text are newly allocated objects, so later
     * changes to this instance's fields do not affect the copy.
     */
    @Override
    protected JoinValue clone() {
        JoinValue o = new JoinValue();
        o.setFlag(new Text(this.getFlag().toString()));
        o.setLineText(new Text(this.getLineText().toString()));
        o.setJoinKey(new IntWritable(this.getJoinKey().get()));
        return o;
    }
}

二、Map Join

Map Join是在Map端进行连接，并且不需要有reduce操作，所以非常高效，是mapreduce中最快的一种连接方式，当一个大数据集和一个或者多个小数据集之间进行连接的时候，可以使用Map Join。

但是map join需要满足一些前提条件才能使用，它需要除了一个大数据集外其他的数据集都比较小，能够读入内存中。另外就是只支持inner join或者是大数据集在左边的left join，因为其他的join类型都需要有reduce阶段。

1、map join的过程

在run函数中，将小数据集存入分布式高速缓存。
在mapper的setup方法中将缓存中的数据读取出来，放入内存。
map函数中处理每一条记录并和缓存中的数据进行连接操作。
将reduce个数设置为0时，map join没有shuffle阶段，直接输出part结果文件，part文件个数和map个数一致。

2、map join实例

package mapreduce.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

/**
 * Command-line entry point for the map-side join example.
 *
 * <p>Created by 鸣宇淳 on 2018/1/9.
 *
 * <p>Run with:
 * hadoop jar ~/input/orderdemo-1.0-SNAPSHOT.jar mapreduce.join.MapJoinMain /input/pv hdfs://ClusterTest/input/province /out/93 2 left
 */
public class MapJoinMain {
    /**
     * Runs {@link MapJoinMapReduce} through {@link ToolRunner} with a fresh
     * {@link Configuration} and terminates the JVM with the status the tool returns.
     *
     * @param args command-line arguments, handed to {@code ToolRunner.run} unchanged
     * @throws Exception anything thrown by {@code ToolRunner.run}
     */
    public static void main(String[] args) throws Exception {
        final int exitCode = ToolRunner.run(new Configuration(), new MapJoinMapReduce(), args);
        System.exit(exitCode);
    }
}


package mapreduce.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

/**
 * Map-side join example: joins two comma-separated files on the province ID and supports
 * inner and left joins.
 *
 * <p>Created by 鸣宇淳 on 2018/1/9.
 *
 * <p>The job reads two files, pv and province, and uses the province ID as the join key.
 * The province file is small, so it is registered as a cache file and loaded into memory
 * by every mapper.
 * The first file has the columns: province ID (provinceId), page views (pv).
 * The second file has the columns: ID, province ID (provinceId), province name (provinceName).
 * Columns in both files are separated by a comma (,).
 *
 * <p>Command-line arguments, in order: pv input path, province file URI, output path,
 * number of reduce tasks, join type (inner or left; case-insensitive).
 *
 * <p>---------------------pv file content: begin-------------------
 * 110000,54878513
 * 120000,16596320
 * 130000,17829678
 * 420000,14703733
 * 430000,12112778
 * 440000,54250547
 * 450000,8231348
 * 460000,1701320
 * 500000,6104667
 * 510000,17313608
 * 520000,5361314
 * 530000,5987739
 * 540000,419185
 * 610000,10126813
 * 620000,3200773
 * 630000,953499
 * 640000,1163306
 * 650000,3168712
 * 820000,71672
 * 990000,18698496
 * ---------------------pv file content: end-------------------
 * <p>
 * ---------------------province file content: begin-------------------
 * 1,110000,北京
 * 2,120000,天津
 * 3,130000,河北
 * 4,140000,山西
 * 6,210000,辽宁
 * 7,220000,吉林
 * 8,230000,黑龙江
 * 25,530000,云南
 * 30,640000,宁夏
 * 31,650000,新疆
 * 32,710000,台湾
 * 33,810000,香港
 * 34,820000,澳门
 * 35,910000,海外
 * 36,990000,其它
 * ---------------------province file content: end-------------------
 */
public class MapJoinMapReduce extends Configured implements Tool {

    /** Configuration property that carries the join type from the driver to the mappers. */
    private static final String JOIN_TYPE_KEY = "join.type";
    /** Counter group used to report input records that were skipped. */
    private static final String COUNTER_GROUP = "MapJoin";
    /** Exit status returned when the command line is invalid. */
    private static final int EXIT_USAGE = 2;

    /**
     * Configures and runs the join job.
     *
     * @param args pv input path, province file URI, output path, number of reduce tasks,
     *             join type (inner or left)
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are invalid
     * @throws Exception if the job cannot be created or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        // Validate the command line up front instead of failing later with an
        // ArrayIndexOutOfBoundsException or silently producing an empty result.
        if (args.length < 5) {
            return usageError("Expected 5 arguments but got " + args.length);
        }
        int reduceTasks;
        try {
            reduceTasks = Integer.parseInt(args[3]);
        } catch (NumberFormatException e) {
            return usageError("Number of reduce tasks is not an integer: " + args[3]);
        }
        String joinType = args[4];
        if (!"inner".equalsIgnoreCase(joinType) && !"left".equalsIgnoreCase(joinType)) {
            return usageError("Unsupported join type: " + joinType);
        }

        Configuration configuration = this.getConf();
        // The mappers read the join type back from the job configuration.
        configuration.set(JOIN_TYPE_KEY, joinType);

        Job job = Job.getInstance(configuration, MapJoinMapReduce.class.getSimpleName());
        job.setJarByClass(MapJoinMapReduce.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setMapperClass(MapJoinMapper.class);

        // Register the small file as a cache file so every mapper can load it in setup().
        job.addCacheFile(new URI(args[1]));
        Path outpath = new Path(args[2]);
        FileOutputFormat.setOutputPath(job, outpath);
        // Map output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Final output types.
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(Text.class);
        // The join itself happens entirely in the mapper; the reduce count is taken from the
        // command line as before.
        job.setNumReduceTasks(reduceTasks);
        boolean isSucces = job.waitForCompletion(true);
        return isSucces ? 0 : 1;
    }

    /** Prints the problem and the expected command line to stderr and returns the usage status. */
    private static int usageError(String message) {
        System.err.println(message);
        System.err.println("Usage: " + MapJoinMapReduce.class.getSimpleName()
                + " <pvInput> <provinceFileUri> <output> <numReduceTasks> <inner|left>");
        return EXIT_USAGE;
    }

    /**
     * Mapper that joins each pv record against the in-memory copy of the province file.
     */
    public static class MapJoinMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
        // Province ID -> province name, loaded from the cache file and kept in memory.
        private HashMap<Integer, String> cacheList = new HashMap<Integer, String>();
        // Join type, passed in through the job configuration.
        private String joinType = null;

        /**
         * Loads the first cache file (the province file) into memory and reads the join type.
         *
         * @throws IOException if the cache file cannot be read or the join type is not configured
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            URI[] cacheFiles = context.getCacheFiles();
            if (cacheFiles != null && cacheFiles.length > 0) {
                Path cachePath = new Path(cacheFiles[0]);
                // Resolve the file system from the path itself so a fully qualified URI
                // (e.g. hdfs://OtherCluster/...) is opened on the file system it names.
                FileSystem fs = cachePath.getFileSystem(context.getConfiguration());
                // Decode explicitly as UTF-8: the province names are non-ASCII and the
                // platform default charset of a task JVM is not guaranteed to be UTF-8.
                BufferedReader rdr = new BufferedReader(
                        new InputStreamReader(fs.open(cachePath), "UTF-8"));
                try {
                    String line;
                    while ((line = rdr.readLine()) != null) {
                        String[] list = line.split(",");
                        if (list.length != 3) {
                            context.getCounter(COUNTER_GROUP, "MALFORMED_PROVINCE_RECORDS").increment(1);
                            continue;
                        }
                        try {
                            // Second column is the province ID, third is the province name.
                            cacheList.put(Integer.valueOf(list[1]), list[2]);
                        } catch (NumberFormatException e) {
                            // A non-numeric ID is invalid data; skip the row, keep loading.
                            context.getCounter(COUNTER_GROUP, "MALFORMED_PROVINCE_RECORDS").increment(1);
                        }
                    }
                } finally {
                    // Close the stream even when reading fails part-way through.
                    rdr.close();
                }
            }
            joinType = context.getConfiguration().get(JOIN_TYPE_KEY);
            if (joinType == null) {
                // Fail with a clear message rather than a NullPointerException in map().
                throw new IOException("Missing required configuration property: " + JOIN_TYPE_KEY);
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Every line is comma-separated; a pv record has exactly two columns.
            String[] list = value.toString().split(",");
            if (list.length != 2) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_PV_RECORDS").increment(1);
                return;
            }
            // The first column is the province ID. A non-numeric ID is invalid data and is
            // skipped like any other malformed row instead of failing the whole task.
            int provinceId;
            try {
                provinceId = Integer.parseInt(list[0]);
            } catch (NumberFormatException e) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_PV_RECORDS").increment(1);
                return;
            }
            String resultVales = list[0] + "\t" + list[1];
            executeJoin(context, provinceId, resultVales);
        }

        /**
         * Joins one pv record against the cached province data.
         *
         * <p>Inner join: writes a row only when the province ID is in the cache. Left join:
         * always writes a row, with an empty province-name column when the ID is not in the
         * cache. Any other join type writes nothing.
         *
         * @param context     task context used to write the output row
         * @param provinceId  join key of the pv record
         * @param resultVales tab-separated columns of the pv record
         */
        private void executeJoin(Context context, int provinceId, String resultVales) throws IOException, InterruptedException {
            boolean inner = joinType.equalsIgnoreCase("inner");
            boolean left = joinType.equalsIgnoreCase("left");
            // Province name from the in-memory small data set, or null when there is no match.
            String cacheValue = cacheList.get(provinceId);
            if (cacheValue != null) {
                if (inner || left) {
                    context.write(new IntWritable(provinceId), new Text(resultVales + "\t" + cacheValue));
                }
            } else if (left) {
                // No match in the small data set: keep the pv row with an empty name column.
                context.write(new IntWritable(provinceId), new Text(resultVales + "\t"));
            }
        }
    }
}

鸣宇淳

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
【十八掌●武功篇】第七掌：MapReduce之join详解

这一篇博文是【大数据技术●降龙十八掌】系列文章的其中一篇，点击查看目录：大数据技术●降龙十八掌一、Reduce Joinreduce端的join操作大概是所有mapreduce join操作中最简单的一种，它通过某一个外键将多个数据集连接起来，可以非常容易地实现inner join、left join、right join、full join，并且它对参与连接的数据集大小没有限制，
复制链接

扫一扫

专栏目录