Map Join (DistributedCache)
Use case
One table is very small and the other is very large.
Solution
Cache the small table(s) on the map side and apply the join logic there ahead of time. This adds work to the map side, reduces the amount of data the reduce side has to handle, and minimizes data skew as far as possible.
Concrete approach: use DistributedCache
1) In the mapper's setup phase, read the cached file into an in-memory collection.
2) In the driver, register the file with the distributed cache:
job.addCacheFile(new URI("file:/e:/mapjoincache/pd.txt")); // cache a plain file onto the task nodes
Sample data
order.txt
201801 01 1
201802 02 2
201803 03 3
201804 01 4
201805 02 5
201806 03 6
pd.txt
01 小米
02 华为
03 格力
Example:
Mapper class:
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
/*
01 小米
02 华为
03 格力
*/
/**
* 201801 01 1
* 201802 02 2
* 201803 03 3
* 201804 01 4
* 201805 02 5
* 201806 03 6
*/
public class MapJoin extends Mapper<LongWritable, Text, Text, NullWritable> {
HashMap<String, String> hashMap = new HashMap<>();
@Override
protected void setup(Context context) throws IOException, InterruptedException {
//read the cached file (the product table); note that the path is hard-coded to a local file here
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("F:\\input\\pd.txt"), "UTF-8"));
//read the data line by line
String line;
while (StringUtils.isNotEmpty(line = reader.readLine())) {
//split, e.g. 01 小米
String[] split = line.split("\t");
//store the pair in the in-memory map
hashMap.put(split[0], split[1]);
}
reader.close();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//each call processes one line of the big table (the order table, order.txt)
String line = value.toString();
//split, e.g. 201801 01 1
String[] fields = line.split("\t");
String pid = fields[1];
//join the two tables on pid
if (hashMap.containsKey(pid)) {
context.write(new Text(fields[0] + "\t" + hashMap.get(pid) + "\t" + fields[2]), NullWritable.get());
}
}
}
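Note: the setup() above reads pd.txt from a hard-coded local path (F:\input\pd.txt), which only works when the job runs locally. When the file has been registered through job.addCacheFile(...), a cluster-friendly variant can locate the localized copy through the context instead. A minimal sketch of such a setup() (an alternative to the one above, not part of the original example; it additionally needs imports of java.net.URI and org.apache.hadoop.fs.Path):
@Override
protected void setup(Context context) throws IOException, InterruptedException {
// URIs registered with job.addCacheFile(); here we assume pd.txt is the first (only) entry
URI[] cacheFiles = context.getCacheFiles();
// the cached file is localized into the task's working directory under its own name
String localName = new Path(cacheFiles[0].getPath()).getName();
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(localName), "UTF-8"));
String line;
while (StringUtils.isNotEmpty(line = reader.readLine())) {
String[] split = line.split("\t");
hashMap.put(split[0], split[1]);
}
reader.close();
}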
Drive (generic driver):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
public class Drive {
/**
* Generic driver
* @param object main class (passed to setJarByClass)
* @param mymap mapper class
* @param mymapkey map output key class
* @param mymapvalue map output value class
* @param num number of reduce tasks
* @param args1 FileInputFormat input path
* @param args2 FileOutputFormat output path
* @param args3 URI of the file to add to the distributed cache
*/
public static void run(Class<?> object,Class<? extends Mapper> mymap,Class<?> mymapkey,Class<?> mymapvalue,int num,String args1,String args2,String args3) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
// 1. get the job instance
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
// 2. set the jar by the main class
job.setJarByClass(object);
// 3. set the mapper class
job.setMapperClass(mymap);
// 4. set the map output key/value types
job.setMapOutputKeyClass(mymapkey);
job.setMapOutputValueClass(mymapvalue);
// cache the small table (a plain file, so addCacheFile rather than addCacheArchive) on the task nodes
job.addCacheFile(new URI(args3));
// set the number of reduce tasks (0 for a map-only join)
job.setNumReduceTasks(num);
// delete the output path if it already exists
Path path = new Path(args2);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(path)) {
fs.delete(path, true);
}
// 5. set the input and output paths
FileInputFormat.setInputPaths(job, new Path(args1));
FileOutputFormat.setOutputPath(job, new Path(args2));
// 6. submit the job
job.waitForCompletion(true);
}
}
import com.hfl.driver.Drive;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.net.URISyntaxException;
public class MapJoinMain {
public static void main(String[] args) throws ClassNotFoundException, URISyntaxException, InterruptedException, IOException {
args = new String[]{"F:\\input\\order.txt", "F:\\input\\mapper", "file:///F:/input/pd.txt"};
Drive.run(MapJoinMain.class, MapJoin.class, Text.class, NullWritable.class, 0, args[0], args[1], args[2]);
}
}
Run result:
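With the sample order.txt and pd.txt above, the map-only job emits one joined line per order (order id, product name, amount), so the output should look roughly like this (same order as order.txt, since there is no reduce phase):
201801 小米 1
201802 华为 2
201803 格力 3
201804 小米 4
201805 华为 5
201806 格力 6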
Reduce Join
1) How it works:
On the map side, the main work is to tag each key/value pair with a label identifying which table (file) it came from, then emit the join field as the key and the remaining fields plus the tag as the value.
On the reduce side, the grouping by the join key has already been done; within each group we only need to separate the records coming from different files (tagged in the map phase) and then merge them.
2) Drawback of this approach
The obvious drawback is that a large amount of data has to be transferred between the map and reduce sides, i.e. through the shuffle stage, so efficiency is low.
TableBean
import lombok.Getter;
import lombok.Setter;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* order.txt
*
201801 01 1
201802 02 2
201803 03 3
201804 01 4
201805 02 5
201806 03 6
*/
/**
* pd.txt
01 小米
02 华为
03 格力
*/
@Setter
@Getter
public class TableBean implements Writable {
//order id
private String order_id;
//product id
private String p_id;
//product quantity
private int amonut;
//product name
private String pname;
//source flag: "0" = order record, "1" = product record
private String flag;
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeUTF(order_id);
dataOutput.writeUTF(p_id);
dataOutput.writeInt(amonut);
dataOutput.writeUTF(pname);
dataOutput.writeUTF(flag);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.order_id = dataInput.readUTF();
this.p_id = dataInput.readUTF();
this.amonut = dataInput.readInt();
this.pname = dataInput.readUTF();
this.flag = dataInput.readUTF();
}
@Override
public String toString() {
return this.order_id+"\t"+this.pname+"\t"+this.amonut;
}
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class ReduceJoinMap extends Mapper<LongWritable,Text,Text,TableBean> {
TableBean tableBean = new TableBean();
Text t = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//get the input split to find out which file this record comes from
FileSplit fileSplit = (FileSplit) context.getInputSplit();
//name of the file
String name = fileSplit.getPath().getName();
//read the line
String line = value.toString();
//tag the record according to the file it came from
if (name.equals("order.txt")){
String[] fields = line.split("\t");
//populate the bean with the order fields
tableBean.setOrder_id(fields[0]);
tableBean.setP_id(fields[1]);
tableBean.setAmonut(Integer.parseInt(fields[2]));
tableBean.setFlag("0");
tableBean.setPname("");
t.set(fields[1]);
}else{
String[] fields = line.split("\t");
//populate the bean with the product fields
tableBean.setP_id(fields[0]);
tableBean.setPname(fields[1]);
tableBean.setFlag("1");
tableBean.setOrder_id("");
tableBean.setAmonut(0);
t.set(fields[0]);
}
context.write(t,tableBean);
}
}
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import static org.apache.commons.beanutils.BeanUtils.copyProperties;
public class ReduceJoinReduce extends Reducer<Text,TableBean, TableBean, NullWritable> {
@Override
protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
//list holding all order records for this join key
ArrayList<TableBean> orderbeans = new ArrayList<TableBean>();
//the product record for this join key
TableBean pdbean = new TableBean();
for (TableBean bean:values) {
//is this an order record?
if("0".equals(bean.getFlag())){
//copy the record into a new object (the framework reuses the bean instance while iterating)
TableBean orderbean = new TableBean();
try {
copyProperties(orderbean,bean);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
orderbeans.add(orderbean);
}else {
//copy the product record into pdbean
try {
copyProperties(pdbean,bean);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}
System.out.println("a");
}
System.out.println("==========分隔符========");
for (TableBean bean2:orderbeans) {
//fill the product name into each order record
bean2.setPname(pdbean.getPname());
//emit the joined record
context.write(bean2,NullWritable.get());
}
}
}
import com.hfl.driver.Drive;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import java.io.IOException;
import java.net.URISyntaxException;
public class ReduceJoin {
public static void main(String[] args) throws ClassNotFoundException, URISyntaxException, InterruptedException, IOException {
args = new String[]{"F:\\input\\reduce", "F:\\input\\reduce\\red"};
Drive.run(ReduceJoin.class, ReduceJoinMap.class, Text.class, TableBean.class, ReduceJoinReduce.class,
TableBean.class, NullWritable.class, args[0], args[1]);
}
}
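Note: this call passes the reducer class and the reducer output types, so it relies on an overload of Drive.run that is not shown above. A minimal sketch of what such an overload could look like inside the Drive class (same structure as the map-only version, plus an extra import of org.apache.hadoop.mapreduce.Reducer; this is an assumption, not the original code):
public static void run(Class<?> object, Class<? extends Mapper> mymap, Class<?> mymapkey, Class<?> mymapvalue,
Class<? extends Reducer> myreduce, Class<?> myreducekey, Class<?> myreducevalue,
String args1, String args2) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(object);
// map side
job.setMapperClass(mymap);
job.setMapOutputKeyClass(mymapkey);
job.setMapOutputValueClass(mymapvalue);
// reduce side
job.setReducerClass(myreduce);
job.setOutputKeyClass(myreducekey);
job.setOutputValueClass(myreducevalue);
// delete the output path if it already exists
Path path = new Path(args2);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(path)) {
fs.delete(path, true);
}
FileInputFormat.setInputPaths(job, new Path(args1));
FileOutputFormat.setOutputPath(job, new Path(args2));
job.waitForCompletion(true);
}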
Run result:
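With the sample data, each order record is joined with its product name. The output is grouped by the join key (pid), and the order of records within a group is not guaranteed, so the result should look roughly like:
201801 小米 1
201804 小米 4
201802 华为 2
201805 华为 5
201803 格力 3
201806 格力 6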