MapJoin Case Study
1) Use case
Map Join is suited to scenarios where one table is very small and the other is very large.
2) Advantages
Consider: joining tables on the Reduce side easily leads to data skew. What can be done?
Cache the small table(s) on the Map side and apply the join logic there. This shifts the work to the Map side, relieves the data pressure on the Reduce side, and reduces data skew as much as possible.
3) Approach: use the DistributedCache
(1) In the Mapper's setup phase, read the cached file into an in-memory collection.
(2) In the Driver class, register the file with the distributed cache.
// Cache an ordinary file so it is shipped to the nodes where the tasks run.
job.addCacheFile(new URI("file:///e:/cache/pd.txt"));
// When running on a cluster, point to an HDFS path instead.
job.addCacheFile(new URI("hdfs://hadoop102:9820/cache/pd.txt"));
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class MapJoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Cache of pid -> pname, built from pd.txt in setup().
    private HashMap<String, String> pidMap = new HashMap<>();
    private Text outK = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the cache file that was registered in the Driver.
        URI[] cacheFiles = context.getCacheFiles();
        FileSystem fileSystem = FileSystem.get(context.getConfiguration());
        FSDataInputStream fis = fileSystem.open(new Path(cacheFiles[0]));

        // Wrap the byte stream in a character stream and read it line by line.
        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
        BufferedReader bufferedReader = new BufferedReader(isr);
        String line;
        String[] split;
        // Read until end of file (or a blank line), caching pid -> pname.
        while ((line = bufferedReader.readLine()) != null && !line.isEmpty()) {
            split = line.split("\t");
            pidMap.put(split[0], split[1]);
        }
        bufferedReader.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Count processed lines with a custom counter.
        context.getCounter("line", "linenumber").increment(1);
        String line = value.toString();
        String[] split = line.split("\t");
        // Replace the pid in the order record with the product name looked up from the cache.
        outK.set(split[0] + "\t" + pidMap.get(split[1]) + "\t" + split[2]);
        context.write(outK, NullWritable.get());
    }
}
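Note that if an order line references a pid that is missing from pd.txt, pidMap.get(split[1]) returns null and the literal string "null" ends up in the output. A defensive variant of the lookup is sketched below; it is not part of the original case, and the "NULL" placeholder is only an assumption.
// Fall back to a placeholder product name when the pid is not in the cached table (placeholder is hypothetical).
String pname = pidMap.getOrDefault(split[1], "NULL");
outK.set(split[0] + "\t" + pname + "\t" + split[2]);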
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class MapJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(MapJoinDriver.class);
        job.setMapperClass(MapJoinMapper.class);
        // With no Reduce phase, the map output classes default to the job output classes set below.
        // job.setMapOutputKeyClass(Text.class);
        // job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setNumReduceTasks(0); // Map-only job: no Reduce phase.
        // Register the small table (passed as the third argument) with the distributed cache.
        job.addCacheFile(new URI(args[2]));
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "Job succeeded!" : "Job failed!");
    }
}
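For reference, a cluster submission might look like the following; the jar name and paths are only assumptions, but the three program arguments must be the input path, the output path, and the URI of the cached pd.txt, in that order:
hadoop jar mapjoin.jar com.example.MapJoinDriver /input/order /output/mapjoin hdfs://hadoop102:9820/cache/pd.txt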
ReduceJoin Case Study
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class ReduceJoinDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);
        job.setJarByClass(ReduceJoinDriver.class);
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(TableBean.class);
        job.setOutputKeyClass(TableBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("I:/input/inputtable2/"));
        FileOutputFormat.setOutputPath(job, new Path("I:/output/reduceOutput55/"));
        boolean b = job.waitForCompletion(true);
        System.out.println(b ? "Job succeeded!" : "Job failed!");
    }
}
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, TableBean> {

    private String filename;
    private Text outK = new Text();
    private TableBean outV = new TableBean();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Get the input split so we know which file this mapper is reading.
        InputSplit inputSplit = context.getInputSplit();
        FileSplit fileSplit = (FileSplit) inputSplit;
        filename = fileSplit.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Emit pid as the key and a TableBean as the value.
        String line = value.toString();
        String[] split = line.split("\t");
        if (filename.contains("order")) {
            // Order record: id \t pid \t amount
            outK.set(split[1]);
            outV.setId(split[0]);
            outV.setPid(split[1]);
            outV.setPname("");
            outV.setAmount(Integer.parseInt(split[2]));
            outV.setFlag("order");
        } else {
            // Product record: pid \t pname
            outK.set(split[0]);
            outV.setId("");
            outV.setPid(split[0]);
            outV.setPname(split[1]);
            outV.setAmount(0);
            outV.setFlag("pd");
        }
        context.write(outK, outV);
    }
}
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.ArrayList;

public class ReduceJoinReduce extends Reducer<Text, TableBean, TableBean, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<TableBean> values, Context context) throws IOException, InterruptedException {
        ArrayList<TableBean> orderBeans = new ArrayList<>(); // holds the order records of this group
        TableBean pdBean = new TableBean(); // holds the pd record of this group
        // The values Iterable is special in Hadoop: to reduce memory overhead it reuses a single
        // object and refills it on every iteration, so to keep a record you must copy it out.
        for (TableBean value : values) {
            if ("order".equals(value.getFlag())) {
                // To collect the records into the list, a new TableBean must be created for each one.
                TableBean tmp = new TableBean();
                try {
                    BeanUtils.copyProperties(tmp, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                orderBeans.add(tmp);
            } else {
                try {
                    BeanUtils.copyProperties(pdBean, value);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        // Join: fill in the product name on every order record and emit it.
        for (TableBean orderBean : orderBeans) {
            orderBean.setPname(pdBean.getPname());
            context.write(orderBean, NullWritable.get());
        }
    }
}
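BeanUtils.copyProperties copies fields via reflection and forces a try/catch inside the loop; for a bean this small, a hand-written copy method on TableBean does the same job without reflection. A minimal sketch (the method name copyFrom is an assumption, not part of the original class):
// Hypothetical helper on TableBean: copy every field from another bean without reflection.
public void copyFrom(TableBean other) {
    this.setId(other.getId());
    this.setPid(other.getPid());
    this.setPname(other.getPname());
    this.setAmount(other.getAmount());
    this.setFlag(other.getFlag());
}
In the reducer, this would replace BeanUtils.copyProperties(tmp, value) with tmp.copyFrom(value).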
import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class TableBean implements Writable {
private String id;
private String pid;
private String pname;
private int amount;
private String flag;
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getPid() {
return pid;
}
public void setPid(String pid) {
this.pid = pid;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
@Override
public String toString() {
return id + "\t" + pname + "\t" + amount;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.getId());
out.writeUTF(this.getPid());
out.writeInt(this.getAmount());
out.writeUTF(this.getPname());
out.writeUTF(this.getFlag());
}
@Override
public void readFields(DataInput in) throws IOException {
this.id = in.readUTF();
this.pid = in.readUTF();
this.amount = in.readInt();
//If a field is serialized with writeInt(), it must be deserialized with readInt() as well, and the fields must be read in the same order they were written; otherwise an EOFException is thrown.
this.pname = in.readUTF();
this.flag = in.readUTF();
}
}
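To see why write() and readFields() must use matching methods in the same field order, a quick round-trip through in-memory streams can be used. This is a standalone sketch with made-up field values, not part of the case code:
import java.io.*;

public class TableBeanRoundTrip {
    public static void main(String[] args) throws IOException {
        TableBean in = new TableBean();
        in.setId("1001"); in.setPid("01"); in.setPname("phone"); in.setAmount(1); in.setFlag("order");

        // Serialize with write(), then deserialize with readFields(); any mismatch in
        // method or field order shows up here as garbled fields or an EOFException.
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bos));

        TableBean out = new TableBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(out); // expected: 1001	phone	1
    }
}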