Hadoop-MR Join Case Analysis

1 MR Reduce-side join

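The original figure is not reproduced here. From the split("\t") calls and the field indices in the mapper below, the job joins two tab-separated inputs: application records in files whose names start with deliver_info (fields: userId, positionId, date), and position records (fields: positionId, positionName). The records below are hypothetical, made up only to illustrate the layout:

deliver_info.txt (userId \t positionId \t date):
user_01	pos_28	2020-01-03
user_02	pos_28	2020-01-04

position.txt (positionId \t positionName):
pos_28	Java Developer
pos_30	Data Engineer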
Code implementation
Bean

package com.lagou.join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class JoinBean implements Writable {

    private String userId;
    private String positionId;
    private String date;
    private String positionName;
    // Marks whether this is an application (deliver) record or a position record
    private String flag;

    public JoinBean() {
    }

    public JoinBean(String userId, String positionId, String date, String positionName, String flag) {
        this.userId = userId;
        this.positionId = positionId;
        this.date = date;
        this.positionName = positionName;
        this.flag = flag;
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPositionId() {
        return positionId;
    }

    public void setPositionId(String positionId) {
        this.positionId = positionId;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPositionName() {
        return positionName;
    }

    public void setPositionName(String positionName) {
        this.positionName = positionName;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(positionId);
        out.writeUTF(date);
        out.writeUTF(positionName);
        out.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.userId=in.readUTF();
        this.positionId=in.readUTF();
        this.date=in.readUTF();
        this.positionName=in.readUTF();
        this.flag =in.readUTF();
    }

    @Override
    public String toString() {
        // Tab-separated output: userId, positionId, date, positionName, flag
        return userId +
                "\t" + positionId +
                "\t" + date +
                "\t" + positionName +
                "\t" + flag;
    }
}

Mapper

package com.lagou.join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class JoinMapper extends Mapper<LongWritable, Text,Text,JoinBean> {


    // Name of the file backing the current split, used to tell the two inputs apart
    String name = "";
    Text k = new Text();
    JoinBean bean = new JoinBean();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Record which input file this map task is reading
        InputSplit inputSplit = context.getInputSplit();
        FileSplit split = (FileSplit) inputSplit;
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t");


        if (name.startsWith("deliver_info")) {
            // Application record: userId \t positionId \t date
            bean.setUserId(arr[0]);
            bean.setPositionId(arr[1]);
            bean.setDate(arr[2]);

            // positionName is unknown on this side; leave it empty for now
            bean.setPositionName("");
            bean.setFlag("deliver");
        } else {
            // Position record: positionId \t positionName
            bean.setUserId("");
            bean.setPositionId(arr[0]);
            bean.setDate("");

            bean.setPositionName(arr[1]);
            bean.setFlag("position");
        }
        // positionId is the join key, so matching records meet in one reduce call
        k.set(bean.getPositionId());

        context.write(k, bean);

    }
}

Reduce

package com.lagou.join;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class JoinReduce extends Reducer<Text,JoinBean,JoinBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<JoinBean> values, Context context) throws IOException, InterruptedException {

        // All beans with the same positionId arrive together (one position, many application records)
        ArrayList<JoinBean> deliverBeans = new ArrayList<>();

        JoinBean positionBean = new JoinBean();

        for (JoinBean bean : values) {
            String flag = bean.getFlag();

            if (flag.equalsIgnoreCase("deliver")) {
                // Application record.
                // Hadoop reuses the bean instance across iterations, so it must be
                // deep-copied instead of being added to the list directly.
                JoinBean newBean = new JoinBean();
                try {
                    BeanUtils.copyProperties(newBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
                // Collect the copied application record
                deliverBeans.add(newBean);
            } else {
                // Position record
                try {
                    BeanUtils.copyProperties(positionBean, bean);
                } catch (IllegalAccessException | InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Fill positionName in on every application record and emit it
        for (JoinBean bean : deliverBeans) {
            bean.setPositionName(positionBean.getPositionName());
            context.write(bean, NullWritable.get());
        }

    }
}

Driver

package com.lagou.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class JoinDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "JoinDriver");

        job.setJarByClass(JoinDriver.class);
        job.setMapperClass(JoinMapper.class);
        job.setReducerClass(JoinReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(JoinBean.class);

        job.setOutputKeyClass(JoinBean.class);
        job.setOutputValueClass(NullWritable.class);



        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));


        boolean flag = job.waitForCompletion(true);

        // Exit with 0 on success, 1 on failure
        System.exit(flag ? 0 : 1);

    }
}
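Given the hypothetical sample records shown earlier, and JoinBean's toString() format, the reduce-side join would emit lines like the following (again, illustrative only):

user_01	pos_28	2020-01-03	Java Developer	deliver
user_02	pos_28	2020-01-04	Java Developer	deliver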

Drawbacks of the Reduce-side join

  • The join itself happens on the Reduce side, and Reduce parallelism is usually low, so execution efficiency is a concern.
  • All records with the same positionId go to the same partition. If the data itself is unbalanced, this leads to one of the most common problems in big data: data skew.
    Solution: use a map-side join instead.

2 MR Map-side join


  • Use a map-side join to associate the application records with the position data.
  • Cache all position data on the map side.
  • The file data read by the map method consists only of application records.
  • For each application record, look up positionName in the cache by its positionId and write out the joined line.
  • This job needs no ReduceTask; set setNumReduceTasks to 0.

package com.lagou.mapjoin;

import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;

public class MapJoinMapper extends Mapper<LongWritable, Text,Text, NullWritable> {

    Text k = new Text();
    // Cache of position data: positionId -> positionName
    HashMap<String, String> map = new HashMap<>();

    // Load the position data into the cache
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {

        // Read the cached file; the distributed cache symlinks position.txt
        // into the task's working directory
        InputStreamReader inputStreamReader = new InputStreamReader(new FileInputStream("position.txt"), "UTF-8");
        BufferedReader reader = new BufferedReader(inputStreamReader);
        // Parse the position data into k,v pairs; key: positionId, value: positionName
        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            String[] fields = line.split("\t");
            map.put(fields[0], fields[1]);
        }
        reader.close();

    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t");


        // Every record read here is an application record
        String positionName = map.get(arr[1]);
        k.set(line + "\t" + positionName);

        context.write(k,NullWritable.get());

    }
}

package com.lagou.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

public class MapJoinDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration, "MapJoinDriver");

        job.setJarByClass(MapJoinDriver.class);
        job.setMapperClass(MapJoinMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);



        FileInputFormat.setInputPaths(job,new Path(args[0]));
        FileOutputFormat.setOutputPath(job,new Path(args[1]));

        // Register the position file with the distributed cache
        job.addCacheFile(URI.create("file:///E:/cache/position.txt"));
        // Map-only job: no reduce tasks needed
        job.setNumReduceTasks(0);

        boolean flag = job.waitForCompletion(true);

        // Exit with 0 on success, 1 on failure
        System.exit(flag ? 0 : 1);

    }
}
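With the same hypothetical inputs, this map-only job simply appends positionName to each application line, so its output would look like:

user_01	pos_28	2020-01-03	Java Developer
user_02	pos_28	2020-01-04	Java Developer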

The map-side join avoids the Reduce-side data skew problem, provided the smaller (position) table fits in each map task's memory.

3 Data skew solutions

A general approach

Salt the key with a random number. Taking MR as an example, this is done in two stages:
Stage 1: append a random number to each key.
Stage 2: strip the random number from the key and aggregate the partial results.

job2 (stage 1): appends a random number to each key, splitting the data into 3 outputs.

job2 therefore produces three output files, and a key such as a may appear in all three of them.
job3 then reads these three outputs and merges the partial results for a into a single record. At that point a contributes only three input records (one per file), so compared with keys that have a single record it no longer causes data skew. A sketch of both stages follows below.
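The original figures are not reproduced here, so below is a minimal sketch of the two stages for a word-count-style aggregation. All names (the com.lagou.skew package, SaltMapper, UnsaltMapper, and the salt range of 3) are hypothetical, chosen only to illustrate the technique; both jobs would pair these mappers with an ordinary summing reducer.

package com.lagou.skew;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.Random;

// SaltMapper.java -- stage 1 (job2): salt each key with a random suffix 0-2,
// so the records of a hot key are spread across up to 3 reducers / output files.
public class SaltMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Random random = new Random();
    private final Text k = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // "a" becomes "a_0", "a_1" or "a_2"
        k.set(value.toString() + "_" + random.nextInt(3));
        context.write(k, one);
    }
}

// UnsaltMapper.java (a separate file) -- stage 2 (job3): strip the salt so the
// partial counts of "a_0", "a_1", "a_2" meet again under the original key "a".
public class UnsaltMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // job2's TextOutputFormat lines look like "a_1\t<partialCount>"
        String[] fields = value.toString().split("\t");
        k.set(fields[0].substring(0, fields[0].lastIndexOf("_")));
        context.write(k, new IntWritable(Integer.parseInt(fields[1])));
    }
}

Because at most 3 salted variants of any key exist, job3's reducer sees at most 3 records per key, which is exactly the effect described above.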
