MapReduce高级案例⑧

最新推荐文章于 2021-11-26 21:12:55 发布

kamisamak

最新推荐文章于 2021-11-26 21:12:55 发布

阅读量157

点赞数

文章标签： mapreduce 大数据 hadoop spark java

本文链接：https://blog.csdn.net/qq_33887096/article/details/114532700

版权

MapReduce中多表合并案例

需求分析

订单数据order.txt（每行三个字段，以制表符分隔：订单id、商品pid、数量）

1001    01    1
1002    02    2
1003    03    3
1004    01    4
1005    02    5
1006    03    6

商品数据pd.txt（每行两个字段，以制表符分隔：商品pid、商品名称）

01    小米
02    华为
03    格力

将商品信息表中数据根据商品pid合并到订单数据表中。

code01(有数据倾斜风险)

将关联条件（商品pid）作为map输出的key，把两张表中满足join条件的数据连同其来源文件的标记一起发往同一个reduce task，在reduce中完成数据的串联。
这种方式中，合并操作在reduce阶段完成，reduce端的处理压力太大，而map节点的运算负载很低，资源利用率不高，且在reduce阶段极易产生数据倾斜。

ruaDriver

package com.kami.demo03;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Driver for the reduce-side join job: configures a job that uses
 * {@link TableMapper} and {@link TableReducer} and submits it.
 *
 * <p>Usage: {@code ruaDriver [input output]}. When fewer than two arguments are
 * supplied, the sample paths {@code data\d03} and {@code output\d03} are used.
 *
 * <p>Created 2020/6/17.
 *
 * @version v 1.0
 * @author kamisamak
 */
public class ruaDriver {
    /**
     * Configures and runs the job, then exits the JVM with status 0 on success
     * and 1 on failure.
     *
     * @param args optional {@code [input, output]} paths
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {

        // Use the sample paths only as a fallback so command-line arguments are honoured
        if (args == null || args.length < 2) {
            args = new String[] { "data\\d03", "output\\d03" };
        }

        // 1 Create the configuration and the job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        // 2 Identify the jar by the class it contains
        job.setJarByClass(ruaDriver.class);

        // 3 Mapper and reducer classes for this job
        job.setMapperClass(TableMapper.class);
        job.setReducerClass(TableReducer.class);

        // 4 Key/value types emitted by the mapper
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);

        // 5 Key/value types of the final output
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);

        // 6 Input and output locations
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 7 Submit the job and block until it finishes
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

TableBean

package com.kami.demo03;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Writable record that can hold either an order row or a product row, so both
 * tables can travel through the shuffle as the same value type.
 *
 * <p>The {@code flag} field marks which table a record came from; the mapper in
 * this package sets it to {@code "0"} for order rows and {@code "1"} for product rows.
 *
 * <p>String fields that are {@code null} are serialized as the empty string, so
 * a partially populated bean can be written without a {@link NullPointerException};
 * after {@link #readFields(DataInput)} such fields read back as {@code ""}.
 *
 * <p>Created 2020/6/17.
 *
 * @version v 1.0
 * @author kamisamak
 */
public class TableBean implements Writable {
    // Order id
    private String order_id;
    // Product id
    private String p_id;
    // Product quantity
    private int amount;
    // Product name
    private String pname;
    // Marker of the source table
    private String flag;

    /** Creates an empty bean; required so the framework can instantiate it before {@link #readFields(DataInput)}. */
    public TableBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param order_id order id
     * @param p_id     product id
     * @param amount   product quantity
     * @param pname    product name
     * @param flag     marker of the source table
     */
    public TableBean(String order_id, String p_id, int amount, String pname, String flag) {
        this.order_id = order_id;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.flag = flag;
    }

    @Override
    public String toString() {
        return "TableBean{" +
                "order_id='" + order_id + '\'' +
                ", p_id='" + p_id + '\'' +
                ", amount=" + amount +
                ", pname='" + pname + '\'' +
                ", flag='" + flag + '\'' +
                '}';
    }

    public String getOrder_id() {
        return order_id;
    }

    public void setOrder_id(String order_id) {
        this.order_id = order_id;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    /**
     * Serializes the fields in the order order_id, p_id, amount, pname, flag.
     * {@code null} strings are written as {@code ""} because
     * {@link DataOutput#writeUTF(String)} rejects {@code null}.
     *
     * @param out sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(nullToEmpty(order_id));
        out.writeUTF(nullToEmpty(p_id));
        out.writeInt(amount);
        out.writeUTF(nullToEmpty(pname));
        out.writeUTF(nullToEmpty(flag));
    }

    /**
     * Restores the fields; the read order must mirror {@link #write(DataOutput)}.
     *
     * @param in source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.flag = in.readUTF();
    }

    // Maps null to "" so the value is safe to pass to writeUTF
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

}

TableMapper

package com.kami.demo03;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Mapper of the reduce-side join: tags every input line with the table it came
 * from and emits it keyed by product id, so order rows and the matching product
 * row reach the same reduce call.
 *
 * <p>Lines are split on a tab character. A file whose name starts with
 * {@code "order"} is treated as the order table (order id, product id, amount);
 * any other file is treated as the product table (product id, product name).
 *
 * <p>Created 2020/6/17.
 *
 * @version v 1.0
 * @author kamisamak
 */
public class TableMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    // Reused for every record; all five fields are overwritten before each write
    TableBean bean = new TableBean();
    Text k = new Text();

    /**
     * Converts one input line into a tagged {@link TableBean} keyed by product id.
     * Blank lines are skipped.
     *
     * @param key     unused
     * @param value   one line of the order or product file
     * @param context used to look up the input file name and to emit the record
     * @throws IOException          if emitting the record fails
     * @throws InterruptedException if emitting the record is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        String line = value.toString();
        // A blank line has no fields to index; skip it instead of failing the task
        if (line.trim().isEmpty()) {
            return;
        }

        // The name of the file behind this split tells which table the line belongs to
        FileSplit split = (FileSplit) context.getInputSplit();
        String name = split.getPath().getName();

        String[] fields = line.split("\t");

        if (name.startsWith("order")) {
            // Order table: order id, product id, amount
            bean.setOrder_id(fields[0]);
            bean.setP_id(fields[1]);
            bean.setAmount(Integer.parseInt(fields[2]));
            bean.setPname("");
            bean.setFlag("0");

            k.set(fields[1]);
        } else {
            // Product table: product id, product name
            bean.setP_id(fields[0]);
            bean.setPname(fields[1]);
            bean.setFlag("1");
            bean.setAmount(0);
            bean.setOrder_id("");

            k.set(fields[0]);
        }

        context.write(k, bean);
    }
}

TableReducer

package com.kami.demo03;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

import static org.apache.commons.beanutils.BeanUtils.copyProperties;

/**
 * Reducer of the reduce-side join: for one product id it receives the order
 * rows (flag {@code "0"}) and the product row (any other flag), fills the
 * product name into every order row and emits the order rows.
 *
 * <p>If several product rows arrive for the same key, the name of the last one
 * is used. If none arrives, the orders are emitted with a {@code null} name.
 *
 * <p>Created 2020/6/17.
 *
 * @version v 1.0
 * @author kamisamak
 */
public class TableReducer extends Reducer<Text, TableBean, TableBean, NullWritable> {

    /**
     * Joins the order rows of one product id with that product's name.
     *
     * @param key     product id shared by all {@code values}
     * @param values  tagged order and product records for this product id
     * @param context used to emit the joined order records
     * @throws IOException          if emitting a record fails
     * @throws InterruptedException if emitting a record is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
        // Orders are buffered because the product row may arrive after them
        ArrayList<TableBean> orderBeans = new ArrayList<>();
        String productName = null;

        for (TableBean bean : values) {
            if ("0".equals(bean.getFlag())) {
                // The framework reuses the value object while iterating, so store a copy
                orderBeans.add(copyOf(bean));
            } else {
                productName = bean.getPname();
            }
        }

        for (TableBean order : orderBeans) {
            order.setPname(productName);
            context.write(order, NullWritable.get());
        }
    }

    // Field-by-field copy; unlike a reflection-based copy it cannot fail at runtime
    private static TableBean copyOf(TableBean source) {
        TableBean copy = new TableBean();
        copy.setOrder_id(source.getOrder_id());
        copy.setP_id(source.getP_id());
        copy.setAmount(source.getAmount());
        copy.setPname(source.getPname());
        copy.setFlag(source.getFlag());
        return copy;
    }
}

map端表合并(DistributedCache)

适用于关联表中有小表的情形；
可以将小表分发到所有的map节点，这样，map节点就可以在本地对自己所读到的大表数据进行合并并输出最终结果，可以大大提高合并操作的并发度，加快处理速度。

ruaDriver

package com.kami.demo04;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the map-side join job: a map-only job that runs
 * {@link DistributedCacheMapper} over the order file.
 *
 * <p>Usage: {@code ruaDriver [input output [cacheFile]]}. When fewer than two
 * arguments are supplied, the sample paths {@code data\d03\order.txt} and
 * {@code output\d04} are used. When a third argument is present, it is
 * registered as a cache file of the job.
 *
 * <p>Created 2020/6/17.
 *
 * @version v 1.0
 * @author kamisamak
 */
public class ruaDriver {
    /**
     * Configures and runs the job, then exits the JVM with status 0 on success
     * and 1 on failure.
     *
     * @param args optional {@code [input, output, cacheFile]} paths
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        // Use the sample paths only as a fallback so command-line arguments are honoured
        if (args == null || args.length < 2) {
            args = new String[]{"data\\d03\\order.txt","output\\d04"};
        }
        // Create the job
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        // Identify the jar by the class it contains
        job.setJarByClass(ruaDriver.class);
        // Mapper class
        job.setMapperClass(DistributedCacheMapper.class);
        // Types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Input and output locations
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Register the small table as a cache file when the caller names one
        if (args.length >= 3) {
            job.addCacheFile(new Path(args[2]).toUri());
        }
        // The join happens entirely in the mapper, so no reduce tasks are needed
        job.setNumReduceTasks(0);
        // Submit the job and block until it finishes
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

DistributedCacheMapper

package com.kami.demo04;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Mapper of the map-side join: loads the product table into memory once per
 * task and appends the product name to every order line.
 *
 * <p>The product table is read from the first cache file registered on the job;
 * when no cache file is registered it is read from {@code data\d03\pd.txt}.
 *
 * <p>Created 2020/6/17.
 *
 * @version v 1.0
 * @author kamisamak
 */
public class DistributedCacheMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Product id -> product name, filled in setup()
    Map<String, String> pdMap = new HashMap<>();

    // Reused output key
    Text k = new Text();

    /**
     * Loads the product table (tab-separated: product id, product name) into
     * {@code pdMap}. Blank lines are skipped rather than ending the read.
     *
     * @param context used to look up the job's cache files
     * @throws IOException          if the product file cannot be read
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // getCacheFiles() can be null when the job registered no cache file
        java.net.URI[] cacheFiles = context.getCacheFiles();
        String pdPath = "data\\d03\\pd.txt";
        if (cacheFiles != null && cacheFiles.length > 0) {
            // NOTE(review): assumes the URI path is readable through the local file
            // system of the task (as in local mode) — confirm before running on a cluster
            pdPath = cacheFiles[0].getPath();
        }

        // try-with-resources closes the reader even when a line fails to parse
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(pdPath), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                String[] fields = line.split("\t");
                pdMap.put(fields[0], fields[1]);
            }
        }
    }

    /**
     * Emits the order line followed by a tab and the product name looked up by
     * the line's second field. An unknown product id yields the text {@code null}
     * as the name. Blank lines are skipped.
     *
     * @param key     unused
     * @param value   one tab-separated order line
     * @param context used to emit the joined line
     * @throws IOException          if emitting the record fails
     * @throws InterruptedException if emitting the record is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // A blank line has no product id field; skip it instead of failing the task
        if (line.trim().isEmpty()) {
            return;
        }
        String[] fields = line.split("\t");
        // Second field is the product id
        String pId = fields[1];
        String pdName = pdMap.get(pId);
        k.set(line + "\t" + pdName);
        context.write(k, NullWritable.get());
    }
}

from:https://www.cnblogs.com/frankdeng/p/9256248.html