CC00040.hadoop——|Hadoop&MapReduce.V13|——|Hadoop.v13|MR reduce-side join|

This post shows how to implement a join on the reduce side of a Hadoop MapReduce job in order to combine two large data files. A MapReduce project is created with a Mapper, a Reducer, and a Driver class; the join key is used as the Mapper's output key, and records coming from the two source files are merged in the Reducer. Although this approach puts considerable processing pressure on the reduce phase, it handles joins over large data sets effectively.
1. MR reduce-side join
### --- MR reduce-side join

~~~     [Reduce-side join: requirement analysis]
~~~     [Reduce-side join: Mapper & Bean class]
~~~     [Reduce-side join: Reducer & Driver implementation]
~~~     [Reduce-side join: verifying the program]
### --- Requirement analysis

~~~     # Requirement:
~~~     The two tables below, the delivery behavior table deliver_info and the position table position,
~~~     are assumed to be huge and stored as files in HDFS; a MapReduce program is needed to
~~~     implement the SQL-style join query between them.

Delivery behavior table deliver_info:

userId    positionId    date
1001      177725422     2020-01-03
1002      177725422     2020-01-04
1002      177725433     2020-01-03

Position table position:

id           positionName
177725422    产品经理
177725433    大数据开发工程师
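
~~~     As a sketch of the physical input, the two tables might be stored on HDFS as tab-separated text
~~~     files like the ones below. The exact file names are an assumption; the Mapper only requires the
~~~     delivery file name to start with deliver_info and the columns to be tab-separated.

deliver_info.txt   (userId <TAB> positionId <TAB> date)
1001	177725422	2020-01-03
1002	177725422	2020-01-04
1002	177725433	2020-01-03

position.txt       (id <TAB> positionName)
177725422	产品经理
177725433	大数据开发工程师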
2. Code implementation
### --- The join condition (positionId) is used as the map output key:
~~~     records from both tables that satisfy the join condition are tagged with the name of the
~~~     source file they came from and sent to the same reduce task, where the join is performed.
~~~     For example, the position record and the two deliver records for positionId 177725422
~~~     all arrive in the same reduce call and can be stitched together there.
### --- Create the project: reduce_join
### --- Driver

package com.yanqi.mr.reduce_join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

//        1. Obtain the configuration object and create the Job instance
        final Configuration conf = new Configuration();

        final Job job = Job.getInstance(conf, "ReduceJoinDriver");
//        2. Specify the local path of the program jar
        job.setJarByClass(ReduceJoinDriver.class);
//        3. Specify the Mapper/Reducer classes
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReducer.class);
//        4. Specify the Mapper output kv types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DeliverBean.class);
//        5. Specify the final output kv types
        job.setOutputKeyClass(DeliverBean.class);
        job.setOutputValueClass(NullWritable.class);


//        6. Specify the job input path
        FileInputFormat.setInputPaths(job, new Path(args[0])); // path of the source data
//        7. Specify the job output path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // path for the result data
//        8. Submit the job and wait for it to finish
        final boolean flag = job.waitForCompletion(true);
        // JVM exit code: 0 on success, non-zero on failure
        System.exit(flag ? 0 : 1);
    }
}
### --- Mapper

package com.yanqi.mr.reduce_join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

// Output kv types: k = positionId, v = DeliverBean
public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, DeliverBean> {
    String name = "";
    Text k = new Text();
    // reusable output bean for the current record (deliver or position data)
    DeliverBean bean = new DeliverBean();

    // setup() runs once when the map task starts
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        InputSplit inputSplit = context.getInputSplit();
        FileSplit split = (FileSplit) inputSplit;
        // name of the file this split came from, used to tell deliver data from position data
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t");

        if (name.startsWith("deliver_info")) {
            // deliver_info record: userId \t positionId \t date
            bean.setUserId(arr[0]);
            bean.setPositionId(arr[1]);
            bean.setDate(arr[2]);
            // fields that only exist in the position table are set to empty strings
            bean.setPositionName("");
            bean.setFlag("deliver");
        } else {
            // position record: id \t positionName
            bean.setPositionId(arr[0]);
            bean.setPositionName(arr[1]);
            // fields that only exist in the deliver_info table are set to empty strings
            bean.setUserId("");
            bean.setDate("");
            bean.setFlag("position");
        }
        k.set(bean.getPositionId());
        context.write(k, bean);
    }
}
### --- Reducer

package com.yanqi.mr.reduce_join;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class ReduceJoinReducer extends Reducer<Text, DeliverBean, DeliverBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<DeliverBean> values, Context context) throws IOException,
            InterruptedException {
        // Beans with the same positionId arrive together (1 position record, n deliver records)
        ArrayList<DeliverBean> deBeans = new ArrayList<>();
        DeliverBean positionBean = new DeliverBean();
        for (DeliverBean bean : values) {
            String flag = bean.getFlag();
            if (flag.equalsIgnoreCase("deliver")) {
                // deliver record
                // The bean cannot be added to deBeans directly: Hadoop reuses the instance, so a deep copy is required
                DeliverBean newBean = new DeliverBean();
                try {
                    BeanUtils.copyProperties(newBean, bean);
                    deBeans.add(newBean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                // position record
                try {
                    BeanUtils.copyProperties(positionBean, bean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Attach the positionName to every deliver record and emit it
        for (DeliverBean bean : deBeans) {
            bean.setPositionName(positionBean.getPositionName());
            context.write(bean, NullWritable.get());
        }
    }
}
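
~~~     Why the deep copy is needed: while iterating over the reduce-side values, Hadoop reuses a single
~~~     DeliverBean instance and only refills its fields, so adding the reference itself to deBeans would
~~~     leave the list full of pointers to one (the last) record. A minimal, self-contained sketch of that
~~~     reuse behavior (the demo class and its fields are illustrative, not part of the project):

// Illustrative demo (not part of the original project): simulates Hadoop's
// object reuse to show why BeanUtils.copyProperties is used in ReduceJoinReducer.
import java.util.ArrayList;
import java.util.List;

public class ObjectReuseDemo {
    static class Record {
        String positionId;
    }

    public static void main(String[] args) {
        Record reused = new Record();            // one instance, refilled for every "incoming" value
        List<Record> wrong = new ArrayList<>();  // stores the reused reference directly
        List<Record> right = new ArrayList<>();  // stores a fresh copy per value
        for (String id : new String[]{"177725422", "177725433"}) {
            reused.positionId = id;
            wrong.add(reused);                   // both list slots end up pointing at the same object
            Record copy = new Record();          // "deep copy" of the current state
            copy.positionId = reused.positionId;
            right.add(copy);
        }
        System.out.println(wrong.get(0).positionId + " / " + wrong.get(1).positionId); // 177725433 / 177725433
        System.out.println(right.get(0).positionId + " / " + right.get(1).positionId); // 177725422 / 177725433
    }
}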
### --- Bean

package com.yanqi.mr.reduce_join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class DeliverBean implements Writable {

    private String userId;
    private String positionId;
    private String date;
    private String positionName;
    // flag telling whether this record is deliver data or position data
    private String flag;

    public DeliverBean() {
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPositionId() {
        return positionId;
    }

    public void setPositionId(String positionId) {
        this.positionId = positionId;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPositionName() {
        return positionName;
    }

    public void setPositionName(String positionName) {
        this.positionName = positionName;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(positionId);
        out.writeUTF(date);
        out.writeUTF(positionName);
        out.writeUTF(flag);

    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.userId = in.readUTF();
        this.positionId = in.readUTF();
        this.date = in.readUTF();
        this.positionName = in.readUTF();
        this.flag = in.readUTF();
    }

    @Override
    public String toString() {
        return "DeliverBean{" +
                "userId='" + userId + '\'' +
                ", positionId='" + positionId + '\'' +
                ", date='" + date + '\'' +
                ", positionName='" + positionName + '\'' +
                ", flag='" + flag + '\'' +
                '}';
    }
}
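
~~~     A quick way to sanity-check DeliverBean's Writable implementation is a serialize/deserialize
~~~     round trip through a byte stream. A minimal sketch (the test class name and field values are
~~~     illustrative; it only relies on the write()/readFields() methods shown above):

package com.yanqi.mr.reduce_join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Illustrative round-trip check for DeliverBean (not part of the original project).
public class DeliverBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        DeliverBean in = new DeliverBean();
        in.setUserId("1001");
        in.setPositionId("177725422");
        in.setDate("2020-01-03");
        in.setPositionName("");
        in.setFlag("deliver");

        // write() serializes the five String fields with writeUTF, in declaration order
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buffer));

        // readFields() must read them back in exactly the same order
        DeliverBean out = new DeliverBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(out);  // should print the same field values as `in`
    }
}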
### --- Drawbacks:

~~~     With this approach the join itself is performed in the reduce phase, so the reduce side
~~~     bears most of the processing pressure while the map nodes do very little work; resource
~~~     utilization is poor and the reduce phase is very prone to data skew.
3. Compile and run
### --- Compile and run

~~~     Configure the job's input and output path arguments.
~~~     Compile, package, and run the program to verify the join result (a sketch follows below).
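
~~~     A minimal sketch of running and verifying the job. The jar name and HDFS paths are hypothetical;
~~~     ReduceJoinDriver only requires the two positional arguments (input directory, output directory).

# hypothetical jar name and HDFS paths
hadoop jar reduce_join.jar com.yanqi.mr.reduce_join.ReduceJoinDriver /user/yanqi/join_input /user/yanqi/join_output

# With the sample data above, the part-r-* output should contain lines such as
# (one DeliverBean.toString() per joined deliver record; order within a key group is not guaranteed):
DeliverBean{userId='1001', positionId='177725422', date='2020-01-03', positionName='产品经理', flag='deliver'}
DeliverBean{userId='1002', positionId='177725422', date='2020-01-04', positionName='产品经理', flag='deliver'}
DeliverBean{userId='1002', positionId='177725433', date='2020-01-03', positionName='大数据开发工程师', flag='deliver'}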