Hadoop-MR join 案例分析

最新推荐文章于 2023-01-15 08:09:15 发布

微毂

最新推荐文章于 2023-01-15 08:09:15 发布

阅读量190

点赞数

分类专栏： hadoop 大数据文章标签： hadoop 大数据

本文链接：https://blog.csdn.net/weixin_42749734/article/details/112748668

版权

大数据同时被 2 个专栏收录

51 篇文章 2 订阅

订阅专栏

hadoop

6 篇文章 0 订阅

订阅专栏

1 MR Reduce端join

在这里插入图片描述
代码实现
Bean

package com.lagou.join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Value object shuttled between map and reduce in the reduce-side join.
 *
 * <p>One instance holds either a delivery record or a position record; {@link #getFlag()}
 * tells the two apart. All fields are serialized as modified-UTF-8 strings, in the same
 * order in {@link #write(DataOutput)} and {@link #readFields(DataInput)}.
 */
public class JoinBean implements Writable {

    private String userId;
    private String positionId;
    private String date;
    private String positionName;
    // Marks whether this record is delivery data or position data.
    private String flag;

    /** No-arg constructor; leaves every field {@code null} until set or deserialized. */
    public JoinBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param userId       id of the user who delivered
     * @param positionId   id of the position (the join key)
     * @param date         delivery date
     * @param positionName name of the position
     * @param flag         record type marker
     */
    public JoinBean(String userId, String positionId, String date, String positionName, String flag) {
        this.userId = userId;
        this.positionId = positionId;
        this.date = date;
        this.positionName = positionName;
        this.flag = flag;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPositionId() {
        return positionId;
    }

    public void setPositionId(String positionId) {
        this.positionId = positionId;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPositionName() {
        return positionName;
    }

    public void setPositionName(String positionName) {
        this.positionName = positionName;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    /**
     * Serializes the five fields in declaration order.
     *
     * <p>{@code DataOutput.writeUTF} rejects {@code null}, so an unset field is written as
     * the empty string; it will therefore read back as {@code ""} rather than {@code null}.
     *
     * @param out sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(nullToEmpty(userId));
        out.writeUTF(nullToEmpty(positionId));
        out.writeUTF(nullToEmpty(date));
        out.writeUTF(nullToEmpty(positionName));
        out.writeUTF(nullToEmpty(flag));
    }

    /**
     * Restores the five fields in the order {@link #write(DataOutput)} emitted them.
     *
     * @param in source to read from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.userId = in.readUTF();
        this.positionId = in.readUTF();
        this.date = in.readUTF();
        this.positionName = in.readUTF();
        this.flag = in.readUTF();
    }

    /**
     * Renders the bean as one tab-separated line:
     * userId, positionId, date, positionName, flag.
     */
    @Override
    public String toString() {
        // Every separator is a bare tab; the original had a stray comma after positionId.
        return userId
                + "\t" + positionId
                + "\t" + date
                + "\t" + positionName
                + "\t" + flag;
    }

    /** Maps {@code null} to the empty string so the value can be passed to writeUTF. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }
}

Mapper

package com.lagou.join;

import jdk.nashorn.internal.ir.CallNode;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class JoinMapper extends Mapper<LongWritable, Text,Text,JoinBean> {


    String name="";
    Text k=new Text();
    JoinBean bean = new JoinBean();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        InputSplit inputSplit = context.getInputSplit();
        FileSplit split=(FileSplit)inputSplit;
        name=split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t");


        if (name.startsWith("deliver_info")){

            bean.setUserId(arr[0]);
            bean.setPositionId(arr[1]);
            bean.setDate(arr[2]);

            // 先置为空
            bean.setPositionName("");
            bean.setFlag("deliver");
        }else {

            bean.setUserId("");
            bean.setPositionId(arr[0]);
            bean.setDate("");

            // 先置为空
            bean.setPositionName(arr[1]);
            bean.setFlag("position");

        }
        k.set(bean.getPositionId());

        context.write(k,bean);

    }
}

Reduce

package com.lagou.join;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

/**
 * Reducer for the reduce-side join.
 *
 * <p>For one position id it receives the position record plus any number of delivery
 * records, and emits every delivery record with the position name filled in.
 */
public class JoinReduce extends Reducer<Text,JoinBean,JoinBean, NullWritable> {

    /**
     * Joins the delivery records of one position id with that position's name.
     *
     * <p>If no position record arrives for the key, delivery records are emitted with
     * whatever position name they already carry.
     *
     * @param key     the position id
     * @param values  mixed delivery and position records for that id
     * @param context sink for the joined delivery records
     */
    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context) throws IOException, InterruptedException {

        // Delivery records for this position (one position, many deliveries).
        ArrayList<JoinBean> deliveries = new ArrayList<>();
        String positionName = null;

        for (JoinBean bean : values) {
            if ("deliver".equalsIgnoreCase(bean.getFlag())) {
                // The framework reuses the value object between iterations, so a copy
                // must be stored. The original built the copy but never added it to the
                // list, which made the reducer emit nothing.
                deliveries.add(copyOf(bean));
            } else {
                // Position record: only its name is needed, and String is immutable.
                positionName = bean.getPositionName();
            }
        }

        // Attach the position name to every delivery record and emit it.
        for (JoinBean delivery : deliveries) {
            if (positionName != null) {
                delivery.setPositionName(positionName);
            }
            context.write(delivery, NullWritable.get());
        }
    }

    /** Returns an independent copy of {@code source} with all five fields duplicated. */
    private static JoinBean copyOf(JoinBean source) {
        return new JoinBean(
                source.getUserId(),
                source.getPositionId(),
                source.getDate(),
                source.getPositionName(),
                source.getFlag());
    }
}

driver

package com.lagou.join;

import com.lagou.homework.HMapper;
import com.lagou.homework.HReducer;
import com.lagou.homework.Hdriver2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver that configures and submits the reduce-side join job.
 *
 * <p>Usage: {@code JoinDriver <input path> <output path>}
 */
public class JoinDriver {

    /**
     * Builds the job, waits for it to finish and exits with 0 on success or 1 on failure.
     * Exits with 2 when the two required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            // Fail with a usage message rather than ArrayIndexOutOfBoundsException.
            System.err.println("Usage: JoinDriver <input path> <output path>");
            System.exit(2);
        }

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "JoinDriver");

        job.setJarByClass(JoinDriver.class);
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);

        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean succeeded = job.waitForCompletion(true);

        // Report the outcome as the process exit code; the original only printed 0/1,
        // so a failed job still looked successful to the calling shell or scheduler.
        System.exit(succeeded ? 0 : 1);
    }
}

Reduce端join的缺点

缺点：

数据聚合功能是在Reduce端完成，Reduce并行度一般不高，所以执行效率存在隐患
相同 positionId 的数据会去往同一分区。如果数据本身分布不均衡，就会造成大数据中最常见的问题之一：数据倾斜
解决：使用Map端join实现

2 MR map端join

在这里插入图片描述

使用map端join完成投递行为与职位数据的关联
map端缓存所有职位数据
map方法读取的文件数据是投递行为数据
基于投递行为数据的positionid去缓存中查询出positionname，输出即可
这个 job 中无需 ReduceTask，通过 job.setNumReduceTasks(0) 将 reduce 任务数设置为 0


package com.lagou.mapjoin;

import jdk.nashorn.internal.ir.CallNode;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;

/**
 * Mapper for the map-side join.
 *
 * <p>{@link #setup} loads the position data (positionId to positionName) into memory;
 * {@link #map} then appends the matching position name to every delivery line. No
 * reducer is involved.
 */
public class MapJoinMapper extends Mapper<LongWritable, Text,Text, NullWritable> {

    // Output key object, reused for every record.
    Text k=new Text();
    // positionId -> positionName lookup table built in setup().
    HashMap<String,String> map=new HashMap<>();

    /**
     * Loads {@code position.txt} from the task's working directory into the lookup table.
     * The driver registers a file of that name through {@code job.addCacheFile}.
     *
     * <p>Blank lines and lines with fewer than two tab-separated fields are skipped.
     *
     * @param context task context (unused)
     * @throws IOException if the file cannot be opened or read
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // try-with-resources closes the reader; the original never closed it
        // (it called reader.lines(), which does nothing useful here).
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("position.txt"), "UTF-8"))) {
            String line;
            // Read to end of file. The original stopped at the first empty line,
            // silently dropping every position listed after it.
            while ((line = reader.readLine()) != null) {
                String[] fields = line.split("\t");
                if (fields.length < 2) {
                    continue;
                }
                // key: positionId, value: positionName
                map.put(fields[0], fields[1]);
            }
        }
    }

    /**
     * Emits the delivery line with its position name appended after a tab.
     *
     * <p>Lines with fewer than two fields (including blank lines) are skipped. When the
     * position id is not in the lookup table the text {@code null} is appended, as before.
     *
     * @param key     byte offset of the line (unused)
     * @param value   one delivery record; field 1 is the position id
     * @param context sink for the joined line
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t");
        if (arr.length < 2) {
            // No position id on this line; the original failed here with
            // ArrayIndexOutOfBoundsException.
            return;
        }

        // All input here is delivery data.
        String positionName = map.get(arr[1]);
        k.set(line + "\t" + positionName);

        context.write(k, NullWritable.get());
    }
}

package com.lagou.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

/**
 * Driver that configures and submits the map-only join job.
 *
 * <p>Usage: {@code MapJoinDriver <input path> <output path> [cache file URI]}
 */
public class MapJoinDriver {

    // Cache file used when no third argument is given (the original hard-coded value).
    private static final String DEFAULT_CACHE_URI = "file:///E:/cache/position.txt";

    /**
     * Builds the job, waits for it to finish and exits with 0 on success or 1 on failure.
     * Exits with 2 when the two required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path,
     *             and the optional {@code args[2]} is the URI of the position cache file.
     *             The file itself must be named {@code position.txt}, because the mapper
     *             opens it by that name.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args.length < 2) {
            // Fail with a usage message rather than ArrayIndexOutOfBoundsException.
            System.err.println("Usage: MapJoinDriver <input path> <output path> [cache file URI]");
            System.exit(2);
        }
        String cacheUri = args.length > 2 ? args[2] : DEFAULT_CACHE_URI;

        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "MapJoinDriver");

        job.setJarByClass(MapJoinDriver.class);
        job.setMapperClass(MapJoinMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Register the position data as a cache file for the mappers.
        job.addCacheFile(URI.create(cacheUri));
        // Map-only job: the join happens in the mapper, so no reduce tasks.
        job.setNumReduceTasks(0);

        boolean succeeded = job.waitForCompletion(true);

        // Report the outcome as the process exit code; the original only printed 0/1.
        System.exit(succeeded ? 0 : 1);
    }
}

map端join避免了Reduce端数据倾斜的问题

3 数据倾斜解决方案

通用解决方法

对key增加随机数
以MR为例
第一阶段：对key增加随机数
第二阶段：去掉key的随机数

job2：给 key 增加一个随机数，分成 3 个输出

在这里插入图片描述
job2 输出三个文件

job2 输出的三个结果，比如a在三个文件都有出现
接下来 job3 读取这三个文件，把类似 a 这样分散在三个文件中的记录汇总为一个结果；此时 a 只有三条数据，与其他只有一条数据的 key 相比差距很小，因此不会产生数据倾斜
在这里插入图片描述

微毂

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
1
评论
Hadoop-MR join 案例分析

1 MR Reduce端join代码实现Beanpackage com.lagou.join;import org.apache.hadoop.io.Writable;import java.io.DataInput;import java.io.DataOutput;import java.io.IOException;public class JoinBean implements Writable { private String userId; privat
复制链接

扫一扫