A Multi-Table Join Case Study in MapReduce

1) Requirement:

Order table t_order:

id	pid	amount
1001	01	1
1002	02	2
1003	03	3

Product table t_product:

pid	pname
01	小米
02	华为
03	格力

Join the product table into the order table on the product id pid.

Final data form:

id	pname	amount
1001	小米	1
1004	小米	4
1002	华为	2
1005	华为	5
1003	格力	3
1006	格力	6
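The join key in both tables is pid. Assuming each table is stored as a tab-separated text file, with the order data in a file whose name starts with order (for example order.txt; the Mapper in Case 1 below keys on this prefix) and the product data in pd.txt (the file cached in Case 2), the raw input would look like this:

order.txt (id, pid, amount):
1001	01	1
1002	02	2
1003	03	3

pd.txt (pid, pname):
01	小米
02	华为
03	格力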

Case 1: Reduce-side join (prone to data skew)

Use the join condition (the product id pid) as the map output key, tag each record with the file it came from, and send all records that share the same pid to the same reduce task; the reduce task then stitches the two tables together. For example, the order line "1001	01	1" and the product line "01	小米" both produce the key "01", so they arrive in the same reduce() call and can be combined into "1001	小米	1".

1) Create the bean that holds the merged order and product fields

package bigdata.b14.b141;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;


// Reduce-side join (prone to data skew)
public class TableBean implements Writable {
    private String order_id; // order id
    private String p_id;     // product id
    private int amount;      // product quantity
    private String pname;    // product name
    private String flag;     // table tag: which table the record came from

    // No-arg constructor, required for deserialization
    public TableBean() {
    }

    public TableBean(String order_id, String p_id, int amount, String pname, String flag) {
        this.order_id = order_id;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }


    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(order_id);
        dataOutput.writeUTF(p_id);
        dataOutput.writeInt(amount);
        dataOutput.writeUTF(pname);
        dataOutput.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.order_id = dataInput.readUTF();
        this.p_id= dataInput.readUTF();
        this.amount=dataInput.readInt();
        this.pname=dataInput.readUTF();
        this.flag=dataInput.readUTF();
    }

    @Override
    public String toString() {
        return order_id+"\t"+pname+"\t"+amount+"\t";
    }
}
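The field order in write must exactly match the order in readFields, otherwise deserialization scrambles the fields. The following round-trip check is a minimal sketch, not part of the original program (it assumes it is placed in the same package as TableBean):

package bigdata.b14.b141;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        TableBean in = new TableBean("1001", "01", 1, "小米", "0");

        // serialize through write(...)
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        in.write(new DataOutputStream(buf));

        // deserialize through readFields(...), which must read in the same field order
        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));

        System.out.println(out); // prints the deserialized bean: 1001	小米	1
    }
}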

2) Write the TableMapper class

package bigdata.b14.b141;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {
    TableBean bean = new TableBean();
    Text k = new Text();

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, TableBean>.Context context) throws IOException, InterruptedException {
        // 1 Get the name of the input file, to tell the two tables apart
        FileSplit split = (FileSplit) context.getInputSplit();
        String name = split.getPath().getName();

        // 2 Get the input line
        String inputdata = value.toString();

        // 3 Handle the two files differently
        if (name.startsWith("order")) {
            // Order table
            // 3.1 Split the line
            String[] splitdata = inputdata.split("\t");

            // 3.2 Fill the bean
            bean.setOrder_id(splitdata[0]);
            bean.setP_id(splitdata[1]);
            bean.setAmount(Integer.parseInt(splitdata[2]));
            bean.setPname("");
            bean.setFlag("0");

            k.set(splitdata[1]); // the join key is the product id pid
        } else {
            // Product table
            // 3.3 Split the line
            String[] splitdata = inputdata.split("\t");

            // 3.4 Fill the bean
            bean.setP_id(splitdata[0]);
            bean.setPname(splitdata[1]);
            bean.setFlag("1");
            bean.setAmount(0);
            bean.setOrder_id("");
            k.set(splitdata[0]); // the join key is the product id pid
        }
        // 4 Emit key = pid, value = tagged bean
        context.write(k, bean);
    }
}

3) Write the TableReduce class

package bigdata.b14.b141;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

public class TableReduce extends Reducer<Text, TableBean, TableBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Reducer<Text, TableBean, TableBean, NullWritable>.Context context) throws IOException, InterruptedException {
        // 1 Collection that will hold every order record for this pid
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        // 2 Bean that will hold the product record for this pid
        TableBean itembean = new TableBean();

        for (TableBean beans : values) {
            if ("0".equals(beans.getFlag())) {
                // Order table record:
                // copy it into a new object, because Hadoop reuses the same TableBean instance while iterating over values
                TableBean orderbean = new TableBean();
                try {
                    BeanUtils.copyProperties(orderbean, beans); // copies the fields of beans into orderbean
                } catch (Exception e) {
                    e.printStackTrace();
                }
                orderBeans.add(orderbean);

            } else {
                // Product table record
                try {
                    BeanUtils.copyProperties(itembean, beans);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        // 3 Join: fill the product name into every order record
        for (TableBean bean : orderBeans) {
            bean.setPname(itembean.getPname());
            // 4 Emit the joined record
            context.write(bean, NullWritable.get());
        }
    }
}
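BeanUtils.copyProperties(orderbean, beans) copies every field of the current values element into a fresh object; this is necessary because Hadoop reuses one TableBean instance while iterating over values, so storing beans itself would leave the list full of references to the same object. If the reflection-based copy feels opaque, the bean's own constructor does the same job; a small sketch of a helper that could be added inside TableReduce (the name copyOf is not part of the original code):

    // Copy all fields of src into a new TableBean, an alternative to BeanUtils.copyProperties
    private static TableBean copyOf(TableBean src) {
        return new TableBean(src.getOrder_id(), src.getP_id(), src.getAmount(),
                src.getPname(), src.getFlag());
    }

Inside the loop, orderBeans.add(copyOf(beans)) would then replace the whole try/catch block.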

4) Write the TableDriver class

package bigdata.b14.b141;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TableDriver {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1 Get the configuration and a Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 Set the jar that contains this driver
        job.setJarByClass(TableDriver.class);

        // 3 Set the mapper and reducer classes for this job
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReduce.class);

        // 4 Set the key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        // 5 Set the key/value types of the final output
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Set the input and output directories (hard-coded local paths for a local run)
        args = new String[]{"D:\\test\\b14", "D:\\test\\b14\\b141"};
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 Submit the job's configuration and jar, and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }
}

5) Run the program and check the result
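
Assuming the input directory D:\test\b14 contains just order.txt and pd.txt and the output directory D:\test\b14\b141 does not yet exist, the resulting part-r-00000 file should match the final data form above for the three sample orders (pid keys arrive at the reducer sorted; each line also ends with an extra tab because of TableBean.toString()):

1001	小米	1
1002	华为	2
1003	格力	3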

Drawback: with this approach the join is performed in the reduce phase, so the reduce side carries most of the processing load while the map nodes do very little work; resource utilization is poor, and the reduce phase is very prone to data skew.

Solution: perform the join on the map side.

Case 2: Map-side join (DistributedCache)

1) Analysis

This approach is suitable when one of the joined tables is small.

The small table can be distributed to every map node, so each map task joins the big-table records it reads locally and emits the final result directly. This greatly increases the parallelism of the join and speeds up processing.

2) Hands-on example

(1) First, read the cached file data in the mapper

package bigdata.b14.b142;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

public class DistributedCacheMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
    // HashMap that caches the small table: pid -> pname
    Map<String, String> pdMap = new HashMap<>();

    // setup() reads the small table once and loads it into memory
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        // 1 Open the cached small-table file (hard-coded local path, fine for a local run)
        BufferedReader reader = new BufferedReader(
                // input stream with an explicit encoding
                new InputStreamReader(
                        new FileInputStream("D:\\test\\b14\\pd.txt"), "UTF-8")); // the small table
        // read line by line; stop at the first empty line or end of file
        String line;
        while (StringUtils.isNotEmpty(line = reader.readLine())) {
            // 2 Split the line
            String[] split = line.split("\t");

            // 3 Cache pid -> pname in pdMap
            pdMap.put(split[0], split[1]);
        }
        // 4 Close the stream
        reader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        // The map input is the big table (the order table)
        // 1 Get one line
        String line = value.toString();

        // 2 Split it
        String[] split = line.split("\t");

        // 3 Get the product id
        String pid = split[1];

        // 4 Join on pid: look it up in pdMap and emit only if the product exists
        if (pdMap.containsKey(pid)) {
            context.write(new Text(split[0] + "\t" + pdMap.get(pid) + "\t" + split[2]), NullWritable.get());
        }
    }
}
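The setup() above opens the small table through a hard-coded local path, which only works for a local run. Because the driver in the next step registers pd.txt with job.addCacheFile, the same file can also be located through the cache API, which works both locally and on a cluster. The following setup() variant is a minimal sketch under that assumption (pd.txt being the only cached file):

    // Extra imports needed for this variant: org.apache.hadoop.fs.FileSystem,
    // org.apache.hadoop.fs.Path, java.net.URI, java.nio.charset.StandardCharsets
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        // 1 Locate the file registered by the driver via job.addCacheFile(...)
        URI cacheUri = context.getCacheFiles()[0];
        FileSystem fs = FileSystem.get(cacheUri, context.getConfiguration());

        // 2 Read it line by line and cache pid -> pname
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(cacheUri)), StandardCharsets.UTF_8))) {
            String line;
            while (StringUtils.isNotEmpty(line = reader.readLine())) {
                String[] split = line.split("\t");
                pdMap.put(split[0], split[1]);
            }
        }
    }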

(2) Add the cache file in the driver

package bigdata.b14.b142;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class DistributedCacheDriver {
    public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException, ClassNotFoundException {
        args = new String[]{"D:\\test\\b14", "D:\\test\\b14\\Distributedcache"};

        // 1 Get a Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2 Set the jar that contains this driver
        job.setJarByClass(DistributedCacheDriver.class);

        // 3 Set the mapper class
        job.setMapperClass(DistributedCacheMapper.class);

        // 4 Set the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 5 Set the input and output directories
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 6 Register the small table as a cache file
        job.addCacheFile(new URI("file:///D:/test/b14/pd.txt"));

        // 7 A map-side join needs no reduce phase, so set the number of reduce tasks to 0
        job.setNumReduceTasks(0);

        // 8 Submit and wait for completion
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
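Since the job runs with zero reduce tasks, its result goes straight from the mappers to D:\test\b14\Distributedcache as part-m-xxxxx files. Assuming the input directory holds only the two sample data files, the lines produced for the three sample orders are:

1001	小米	1
1002	华为	2
1003	格力	3

Lines read from pd.txt itself produce no output, because their second column is a product name rather than a pid present in pdMap.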

 
