CC00040.hadoop——|Hadoop&MapReduce.V13|——|Hadoop.v13|MR reduce-side join|

This post shows how to implement a join on the reduce side of a Hadoop MapReduce job in order to combine two large data files. A MapReduce project is created with a Mapper, a Reducer, and a Driver class; the join key is used as the Mapper's output key, and records coming from the two source files are merged in the Reducer. Although this approach puts considerable processing pressure on the reduce phase, it handles joins over large data sets effectively.
1. MR reduce-side join
### --- MR reduce-side join

~~~     [Reduce-side join: requirement analysis]
~~~     [Reduce-side join: Mapper & Bean class]
~~~     [Reduce-side join: Reducer & Driver implementation]
~~~     [Reduce-side join: verifying the program]
### --- Requirement analysis

~~~     # Requirement:
~~~     The two tables below, the delivery behavior table deliver_info and the position table position,
~~~     are assumed to be huge and stored as files in HDFS; a MapReduce program is needed to
~~~     implement the SQL-style join query between them.

Delivery behavior table deliver_info:

userId    positionId    date
1001      177725422     2020-01-03
1002      177725422     2020-01-04
1002      177725433     2020-01-03

Position table position:

id           positionName
177725422    产品经理
177725433    大数据开发工程师
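
~~~     As a sketch of the physical input, the two tables might be stored on HDFS as tab-separated text
~~~     files like the ones below. The exact file names are an assumption; the Mapper only requires the
~~~     delivery file name to start with deliver_info and the columns to be tab-separated.

deliver_info.txt   (userId <TAB> positionId <TAB> date)
1001	177725422	2020-01-03
1002	177725422	2020-01-04
1002	177725433	2020-01-03

position.txt       (id <TAB> positionName)
177725422	产品经理
177725433	大数据开发工程师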
2. Code implementation
### --- The join condition (positionId) is used as the map output key:
~~~     records from both tables that satisfy the join condition are tagged with the name of the
~~~     source file they came from and sent to the same reduce task, where the join is performed.
~~~     For example, the position record and the two deliver records for positionId 177725422
~~~     all arrive in the same reduce call and can be stitched together there.
### --- Create the project: reduce_join
### --- Driver

package com.yanqi.mr.reduce_join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

//        1. Obtain the configuration object and create the Job instance
        final Configuration conf = new Configuration();

        final Job job = Job.getInstance(conf, "ReduceJoinDriver");
//        2. Specify the local path of the program jar
        job.setJarByClass(ReduceJoinDriver.class);
//        3. Specify the Mapper/Reducer classes
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReducer.class);
//        4. Specify the Mapper output kv types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(DeliverBean.class);
//        5. Specify the final output kv types
        job.setOutputKeyClass(DeliverBean.class);
        job.setOutputValueClass(NullWritable.class);


//        6. Specify the job input path
        FileInputFormat.setInputPaths(job, new Path(args[0])); // path of the source data
//        7. Specify the job output path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // path for the result data
//        8. Submit the job and wait for it to finish
        final boolean flag = job.waitForCompletion(true);
        // JVM exit code: 0 on success, non-zero on failure
        System.exit(flag ? 0 : 1);
    }
}
### --- Mapper

package com.yanqi.mr.reduce_join;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

// Output kv types: k = positionId, v = DeliverBean
public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, DeliverBean> {
    String name = "";
    Text k = new Text();
    // reusable output bean for the current record (deliver or position data)
    DeliverBean bean = new DeliverBean();

    // setup() runs once when the map task starts
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        InputSplit inputSplit = context.getInputSplit();
        FileSplit split = (FileSplit) inputSplit;
        // name of the file this split came from, used to tell deliver data from position data
        name = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] arr = line.split("\t");

        if (name.startsWith("deliver_info")) {
            // deliver_info record: userId \t positionId \t date
            bean.setUserId(arr[0]);
            bean.setPositionId(arr[1]);
            bean.setDate(arr[2]);
            // fields that only exist in the position table are set to empty strings
            bean.setPositionName("");
            bean.setFlag("deliver");
        } else {
            // position record: id \t positionName
            bean.setPositionId(arr[0]);
            bean.setPositionName(arr[1]);
            // fields that only exist in the deliver_info table are set to empty strings
            bean.setUserId("");
            bean.setDate("");
            bean.setFlag("position");
        }
        k.set(bean.getPositionId());
        context.write(k, bean);
    }
}
### --- Reducer

package com.yanqi.mr.reduce_join;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class ReduceJoinReducer extends Reducer<Text, DeliverBean, DeliverBean, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<DeliverBean> values, Context context) throws IOException,
            InterruptedException {
        // Beans with the same positionId arrive together (1 position record, n deliver records)
        ArrayList<DeliverBean> deBeans = new ArrayList<>();
        DeliverBean positionBean = new DeliverBean();
        for (DeliverBean bean : values) {
            String flag = bean.getFlag();
            if (flag.equalsIgnoreCase("deliver")) {
                // deliver record
                // The bean cannot be added to deBeans directly: Hadoop reuses the instance, so a deep copy is required
                DeliverBean newBean = new DeliverBean();
                try {
                    BeanUtils.copyProperties(newBean, bean);
                    deBeans.add(newBean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            } else {
                // position record
                try {
                    BeanUtils.copyProperties(positionBean, bean);
                } catch (IllegalAccessException e) {
                    e.printStackTrace();
                } catch (InvocationTargetException e) {
                    e.printStackTrace();
                }
            }
        }

        // Attach the positionName to every deliver record and emit it
        for (DeliverBean bean : deBeans) {
            bean.setPositionName(positionBean.getPositionName());
            context.write(bean, NullWritable.get());
        }
    }
}
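
~~~     Why the deep copy is needed: while iterating over the reduce-side values, Hadoop reuses a single
~~~     DeliverBean instance and only refills its fields, so adding the reference itself to deBeans would
~~~     leave the list full of pointers to one (the last) record. A minimal, self-contained sketch of that
~~~     reuse behavior (the demo class and its fields are illustrative, not part of the project):

// Illustrative demo (not part of the original project): simulates Hadoop's
// object reuse to show why BeanUtils.copyProperties is used in ReduceJoinReducer.
import java.util.ArrayList;
import java.util.List;

public class ObjectReuseDemo {
    static class Record {
        String positionId;
    }

    public static void main(String[] args) {
        Record reused = new Record();            // one instance, refilled for every "incoming" value
        List<Record> wrong = new ArrayList<>();  // stores the reused reference directly
        List<Record> right = new ArrayList<>();  // stores a fresh copy per value
        for (String id : new String[]{"177725422", "177725433"}) {
            reused.positionId = id;
            wrong.add(reused);                   // both list slots end up pointing at the same object
            Record copy = new Record();          // "deep copy" of the current state
            copy.positionId = reused.positionId;
            right.add(copy);
        }
        System.out.println(wrong.get(0).positionId + " / " + wrong.get(1).positionId); // 177725433 / 177725433
        System.out.println(right.get(0).positionId + " / " + right.get(1).positionId); // 177725422 / 177725433
    }
}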
### --- Bean

package com.yanqi.mr.reduce_join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class DeliverBean implements Writable {

    private String userId;
    private String positionId;
    private String date;
    private String positionName;
    // flag telling whether this record is deliver data or position data
    private String flag;

    public DeliverBean() {
    }

    public String getUserId() {
        return userId;
    }

    public void setUserId(String userId) {
        this.userId = userId;
    }

    public String getPositionId() {
        return positionId;
    }

    public void setPositionId(String positionId) {
        this.positionId = positionId;
    }

    public String getDate() {
        return date;
    }

    public void setDate(String date) {
        this.date = date;
    }

    public String getPositionName() {
        return positionName;
    }

    public void setPositionName(String positionName) {
        this.positionName = positionName;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(userId);
        out.writeUTF(positionId);
        out.writeUTF(date);
        out.writeUTF(positionName);
        out.writeUTF(flag);

    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.userId = in.readUTF();
        this.positionId = in.readUTF();
        this.date = in.readUTF();
        this.positionName = in.readUTF();
        this.flag = in.readUTF();
    }

    @Override
    public String toString() {
        return "DeliverBean{" +
                "userId='" + userId + '\'' +
                ", positionId='" + positionId + '\'' +
                ", date='" + date + '\'' +
                ", positionName='" + positionName + '\'' +
                ", flag='" + flag + '\'' +
                '}';
    }
}
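
~~~     A quick way to sanity-check DeliverBean's Writable implementation is a serialize/deserialize
~~~     round trip through a byte stream. A minimal sketch (the test class name and field values are
~~~     illustrative; it only relies on the write()/readFields() methods shown above):

package com.yanqi.mr.reduce_join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Illustrative round-trip check for DeliverBean (not part of the original project).
public class DeliverBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        DeliverBean in = new DeliverBean();
        in.setUserId("1001");
        in.setPositionId("177725422");
        in.setDate("2020-01-03");
        in.setPositionName("");
        in.setFlag("deliver");

        // write() serializes the five String fields with writeUTF, in declaration order
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buffer));

        // readFields() must read them back in exactly the same order
        DeliverBean out = new DeliverBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(out);  // should print the same field values as `in`
    }
}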
### --- Drawbacks:

~~~     With this approach the join itself is performed in the reduce phase, so the reduce side
~~~     bears most of the processing pressure while the map nodes do very little work; resource
~~~     utilization is poor and the reduce phase is very prone to data skew.
3. Compile and run
### --- Compile and run

~~~     Configure the job's input and output path arguments.
~~~     Compile, package, and run the program to verify the join result (a sketch follows below).
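
~~~     A minimal sketch of running and verifying the job. The jar name and HDFS paths are hypothetical;
~~~     ReduceJoinDriver only requires the two positional arguments (input directory, output directory).

# hypothetical jar name and HDFS paths
hadoop jar reduce_join.jar com.yanqi.mr.reduce_join.ReduceJoinDriver /user/yanqi/join_input /user/yanqi/join_output

# With the sample data above, the part-r-* output should contain lines such as
# (one DeliverBean.toString() per joined deliver record; order within a key group is not guaranteed):
DeliverBean{userId='1001', positionId='177725422', date='2020-01-03', positionName='产品经理', flag='deliver'}
DeliverBean{userId='1002', positionId='177725422', date='2020-01-04', positionName='产品经理', flag='deliver'}
DeliverBean{userId='1002', positionId='177725433', date='2020-01-03', positionName='大数据开发工程师', flag='deliver'}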