A Multi-Table Join Case Study in MapReduce

1) Requirement:

Order table t_order:

id	pid	amount
1001	01	1
1002	02	2
1003	03	3

Product table t_product:

pid	pname
01	小米
02	华为
03	格力

Join the product table into the order table on the product id pid.

Final data form:

id	pname	amount
1001	小米	1
1004	小米	4
1002	华为	2
1005	华为	5
1003	格力	3
1006	格力	6
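The join key in both tables is pid. Assuming each table is stored as a tab-separated text file, with the order data in a file whose name starts with order (for example order.txt; the Mapper in Case 1 below keys on this prefix) and the product data in pd.txt (the file cached in Case 2), the raw input would look like this:

order.txt (id, pid, amount):
1001	01	1
1002	02	2
1003	03	3

pd.txt (pid, pname):
01	小米
02	华为
03	格力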

Case 1: Reduce-side join (prone to data skew)

Use the join condition (the product id pid) as the map output key, tag each record with the file it came from, and send all records that share the same pid to the same reduce task; the reduce task then stitches the two tables together. For example, the order line "1001	01	1" and the product line "01	小米" both produce the key "01", so they arrive in the same reduce() call and can be combined into "1001	小米	1".

1) Create the bean that holds the merged order and product fields

package bigdata.b14.b141;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;


// Reduce-side join (prone to data skew)
public class TableBean implements Writable {
    private String order_id; // order id
    private String p_id;     // product id
    private int amount;      // product quantity
    private String pname;    // product name
    private String flag;     // table tag: which table the record came from

    // No-arg constructor, required for deserialization
    public TableBean() {
    }

    public TableBean(String order_id, String p_id, int amount, String pname, String flag) {
        this.order_id = order_id;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }


    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(order_id);
        dataOutput.writeUTF(p_id);
        dataOutput.writeInt(amount);
        dataOutput.writeUTF(pname);
        dataOutput.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.order_id = dataInput.readUTF();
        this.p_id= dataInput.readUTF();
        this.amount=dataInput.readInt();
        this.pname=dataInput.readUTF();
        this.flag=dataInput.readUTF();
    }

    @Override
    public String toString() {
        return order_id+"\t"+pname+"\t"+amount+"\t";
    }
}
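The field order in write must exactly match the order in readFields, otherwise deserialization scrambles the fields. The following round-trip check is a minimal sketch, not part of the original program (it assumes it is placed in the same package as TableBean):

package bigdata.b14.b141;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        TableBean in = new TableBean("1001", "01", 1, "小米", "0");

        // serialize through write(...)
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));

        // deserialize through readFields(...), which must read in the same field order
        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        System.out.println(out); // prints the deserialized bean: 1001	小米	1
    }
}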

2) Write the TableMapper class

package bigdata.b14.b141;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
    TableBean bean = new TableBean();
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, TableBean>.Context context) throws IOException, InterruptedException {
        // 1 Get the name of the input file, to tell the two tables apart
        FileSplit split = (FileSplit) context.getInputSplit();
        String name = split.getPath().getName();

        // 2 Get the input line
        String inputdata = value.toString();

        // 3 Handle the two files differently
        if (name.startsWith("order")) {
            // Order table
            // 3.1 Split the line
            String[] splitdata = inputdata.split("\t");

            // 3.2 Fill the bean
            bean.setOrder_id(splitdata[0]);
            bean.setP_id(splitdata[1]);
            bean.setAmount(Integer.parseInt(splitdata[2]));
            bean.setPname("");
            bean.setFlag("0");

            k.set(splitdata[1]); // the join key is the product id pid
        } else {
            // Product table
            // 3.3 Split the line
            String[] splitdata = inputdata.split("\t");

            // 3.4 Fill the bean
            bean.setP_id(splitdata[0]);
            bean.setPname(splitdata[1]);
            bean.setFlag("1");
            bean.setAmount(0);
            bean.setOrder_id("");
            k.set(splitdata[0]); // the join key is the product id pid
        }
        // 4 Emit key = pid, value = tagged bean
        context.write(k, bean);
    }
}

3) Write the TableReduce class

package bigdata.b14.b141;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class TableReduce extends Reducer<Text, TableBean, TableBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Reducer<Text, TableBean, TableBean, NullWritable>.Context context) throws IOException, InterruptedException {
        // 1 Collection that will hold every order record for this pid
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        // 2 Bean that will hold the product record for this pid
        TableBean itembean = new TableBean();

        for (TableBean beans : values) {
            if ("0".equals(beans.getFlag())) {
                // Order table record:
                // copy it into a new object, because Hadoop reuses the same TableBean instance while iterating over values
                TableBean orderbean = new TableBean();
                try {
                    BeanUtils.copyProperties(orderbean, beans); // copies the fields of beans into orderbean
                } catch (Exception e) {
                    e.printStackTrace();
                }
                orderBeans.add(orderbean);

            } else {
                // Product table record
                try {
                    BeanUtils.copyProperties(itembean, beans);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        // 3 Join: fill the product name into every order record
        for (TableBean bean : orderBeans) {
            bean.setPname(itembean.getPname());
            // 4 Emit the joined record
            context.write(bean, NullWritable.get());
        }
    }
}
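BeanUtils.copyProperties(orderbean, beans) copies every field of the current values element into a fresh object; this is necessary because Hadoop reuses one TableBean instance while iterating over values, so storing beans itself would leave the list full of references to the same object. If the reflection-based copy feels opaque, the bean's own constructor does the same job; a small sketch of a helper that could be added inside TableReduce (the name copyOf is not part of the original code):

    // Copy all fields of src into a new TableBean, an alternative to BeanUtils.copyProperties
    private static TableBean copyOf(TableBean src) {
        return new TableBean(src.getOrder_id(), src.getP_id(), src.getAmount(),
                src.getPname(), src.getFlag());
    }

Inside the loop, orderBeans.add(copyOf(beans)) would then replace the whole try/catch block.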

4) Write the TableDriver class

package bigdata.b14.b141;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1 Get the configuration and a Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 Set the jar that contains this driver
        job.setJarByClass(TableDriver.class);

        // 3 Set the mapper and reducer classes for this job
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReduce.class);

        // 4 Set the key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        // 5 Set the key/value types of the final output
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output directories (hard-coded local paths for a local run)
        args = new String[]{"D:\\test\\b14", "D:\\test\\b14\\b141"};
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 Submit the job's configuration and jar, and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }
}

5) Run the program and check the result
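
Assuming the input directory D:\test\b14 contains just order.txt and pd.txt and the output directory D:\test\b14\b141 does not yet exist, the resulting part-r-00000 file should match the final data form above for the three sample orders (pid keys arrive at the reducer sorted; each line also ends with an extra tab because of TableBean.toString()):

1001	小米	1
1002	华为	2
1003	格力	3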

Drawback: with this approach the join is performed in the reduce phase, so the reduce side carries most of the processing load while the map nodes do very little work; resource utilization is poor, and the reduce phase is very prone to data skew.

Solution: perform the join on the map side.

Case 2: Map-side join (DistributedCache)

1) Analysis

This approach is suitable when one of the joined tables is small.

The small table can be distributed to every map node, so each map task joins the big-table records it reads locally and emits the final result directly. This greatly increases the parallelism of the join and speeds up processing.

2) Hands-on example

(1) First, read the cached file data in the mapper

package bigdata.b14.b142;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
    // HashMap that caches the small table: pid -> pname
    Map<String, String> pdMap = new HashMap<>();

    // setup() reads the small table once and loads it into memory
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        // 1 Open the cached small-table file (hard-coded local path, fine for a local run)
        BufferedReader reader = new BufferedReader(
                // input stream with an explicit encoding
                new InputStreamReader(
                        new FileInputStream("D:\\test\\b14\\pd.txt"), "UTF-8")); // the small table
        // read line by line; stop at the first empty line or end of file
        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] split = line.split("\t");

            // 3 Cache pid -> pname in pdMap
            pdMap.put(split[0], split[1]);
        }
        // 4 Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        // The map input is the big table (the order table)
        // 1 Get one line
        String line = value.toString();

        // 2 Split it
        String[] split = line.split("\t");

        // 3 Get the product id
        String pid = split[1];

        // 4 Join on pid: look it up in pdMap and emit only if the product exists
        if (pdMap.containsKey(pid)) {
            context.write(new Text(split[0] + "\t" + pdMap.get(pid) + "\t" + split[2]), NullWritable.get());
        }
    }
}
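The setup() above opens the small table through a hard-coded local path, which only works for a local run. Because the driver in the next step registers pd.txt with job.addCacheFile, the same file can also be located through the cache API, which works both locally and on a cluster. The following setup() variant is a minimal sketch under that assumption (pd.txt being the only cached file):

    // Extra imports needed for this variant: org.apache.hadoop.fs.FileSystem,
    // org.apache.hadoop.fs.Path, java.net.URI, java.nio.charset.StandardCharsets
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        // 1 Locate the file registered by the driver via job.addCacheFile(...)
        URI cacheUri = context.getCacheFiles()[0];
        FileSystem fs = FileSystem.get(cacheUri, context.getConfiguration());

        // 2 Read it line by line and cache pid -> pname
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(cacheUri)), StandardCharsets.UTF_8))) {
            String line;
            while (StringUtils.isNotEmpty(line = reader.readLine())) {
                String[] split = line.split("\t");
                pdMap.put(split[0], split[1]);
            }
        }
    }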

(2) Add the cache file in the driver

package bigdata.b14.b142;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class DistributedCacheDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        args = new String[]{"D:\\test\\b14", "D:\\test\\b14\\Distributedcache"};

        // 1 Get a Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 Set the jar that contains this driver
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Set the mapper class
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output directories
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Register the small table as a cache file
        job.addCacheFile(new URI("file:///D:/test/b14/pd.txt"));

        // 7 A map-side join needs no reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
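Since the job runs with zero reduce tasks, its result goes straight from the mappers to D:\test\b14\Distributedcache as part-m-xxxxx files. Assuming the input directory holds only the two sample data files, the lines produced for the three sample orders are:

1001	小米	1
1002	华为	2
1003	格力	3

Lines read from pd.txt itself produce no output, because their second column is a product name rather than a pid present in pdMap.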

 
