MapReduce基础入门6

最新推荐文章于 2024-10-04 18:47:42 发布

陈万君Allen

最新推荐文章于 2024-10-04 18:47:42 发布

阅读量491

点赞数

分类专栏： Java和大数据文章标签： mapreduce hadoop 大数据

本文链接：https://blog.csdn.net/weixin_53280379/article/details/127183417

版权

Java和大数据专栏收录该内容

43 篇文章 6 订阅

订阅专栏

Hadoop系列

注：大家觉得博客好的话，别忘了点赞收藏呀，本人每周都会更新关于人工智能和大数据相关的内容，内容多为原创，Python Java Scala SQL 代码，CV NLP 推荐系统等，Spark Flink Kafka Hbase Hive Flume等等~写的都是纯干货，各种顶会的论文解读，一起进步。
今天继续和大家分享一下MapReduce基础入门6
#博学谷IT学习技术支持

前言

在这里插入图片描述
1、MapReduce会将一个大的计算任务进行拆分，拆分成小任务，让这些小任务在不同的计算机中进行处理，最后再将这些小任务的结果进行整体汇总

2、MapReduce分为两个阶段，一个Map阶段负责任务的拆分，一个是Reduce阶段，负责任务的汇总

3、整个MapReduce工作流程可以分为3个阶段：map、shuffle、reduce。

作者这里再用一个简单的案例来说明如何用MapReduce实现两张大表之间的Join操作，这种方式也称为Reduce Join。

一、Reduce Join是什么？

reduce side join，顾名思义，在reduce阶段执行join关联操作。
这也是最容易想到和实现的join方式。因为通过shuffle过程就可以将相关的数据分到相同的分组中，这为后面的join操作提供了便利。
在这里插入图片描述

二、使用步骤

1.数据准备

第一个商品表
在这里插入图片描述
第二个订单表

在这里插入图片描述

任务很简单，两个表都有商品ID这个字段，需求：通过商品ID将两个表关联起来。

2.第一个Map阶段

这里Map阶段的难度在于，两张表的结构不一样，所以要分开写
通过(FileSplit) context.getInputSplit()强转成FileSplit得到文件的切片，然后可以通过getPath().getName()得到输入文件的名字，这样就可以单独处理不同结构的文件。

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;

/**
 * Map side of the reduce-side join: tags every record with its source table and
 * keys it by the join column so that matching rows meet in the same reduce call.
 *
 * <p>Input lines are split on {@code |}. Which columns are used depends on the name
 * of the file the current split belongs to:
 * <ul>
 *   <li>{@code itheima_order_goods.txt} (order file): key = field 1,
 *       value = {@code "o_" + field0 + "\t" + field2}</li>
 *   <li>{@code itheima_goods.txt} (goods file): key = field 0,
 *       value = {@code "g_" + field0 + "\t" + field2}</li>
 * </ul>
 * Records from any other file are ignored. Lines with fewer than three fields are
 * skipped and counted under the {@code Mapper_demo / MALFORMED_LINES} counter instead
 * of failing the task with an {@link ArrayIndexOutOfBoundsException}.
 */
public class Mapper_demo extends Mapper<LongWritable,Text,Text,Text>{
    private static final String ORDER_FILE = "itheima_order_goods.txt";
    private static final String GOODS_FILE = "itheima_goods.txt";
    /** Both record layouts read up to index 2, so at least three fields are required. */
    private static final int MIN_FIELDS = 3;

    // Reused for every output record; context.write serializes the contents immediately.
    private final Text outKey = new Text();
    private final Text outValue = new Text();

    /** Name of the file backing the current input split; fixed for the lifetime of the task. */
    private String fileName;

    /**
     * Resolves the source file name once per task instead of once per record.
     *
     * @param context task context; its input split is expected to be a {@link FileSplit}
     */
    @Override
    protected void setup(Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        fileName = inputSplit.getPath().getName();
    }

    /**
     * Emits one tagged (joinKey, payload) pair per well-formed input line.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one {@code |}-delimited input line
     * @param context sink for the tagged output pair
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
        boolean isOrder = ORDER_FILE.equals(fileName);
        boolean isGoods = GOODS_FILE.equals(fileName);
        if (!isOrder && !isGoods) {
            // Not one of the two joined tables: nothing to emit.
            return;
        }

        String[] array = value.toString().split("\\|");
        if (array.length < MIN_FIELDS) {
            // Blank or truncated line: skip it but make the loss visible in the job counters.
            context.getCounter("Mapper_demo", "MALFORMED_LINES").increment(1);
            return;
        }

        if (isOrder) {
            // Order record: the join key is the second column.
            outKey.set(array[1]);
            outValue.set("o_" + array[0] + "\t" + array[2]);
        } else {
            // Goods record: the join key is the first column.
            outKey.set(array[0]);
            outValue.set("g_" + array[0] + "\t" + array[2]);
        }
        context.write(outKey, outValue);
    }
}

3.第一个Reduce阶段

核心思想是：经过分组操作之后，K2（商品ID）会被去重，相同K2的V2会被放到同一个集合中，这样就基本完成了Reduce Join操作。只是这样的结果非常乱，需要一些额外的操作来让结果变得整洁，所以首先进行拼接操作。

/**
 * Reduce side of the join. For one key it receives every tagged value produced by
 * the map phase, separates them by their two-character tag and writes one output
 * line per order record: the order payload, a tab, then the goods payload.
 *
 * <p>Values starting with {@code "o_"} are treated as order records and values
 * starting with {@code "g_"} as goods records; the tag is stripped before use.
 * If several goods records share a key, the last one seen is used. If none is
 * present, the goods part of each output line is empty.
 */
public class Reducer_demo extends Reducer<Text,Text,Text, NullWritable> {
    // Buffer of order payloads for the key being reduced; cleared at the start of every call.
    ArrayList<String> orderList = new ArrayList<>();

    /**
     * Joins the order records of {@code key} with its goods record.
     *
     * @param key     the join key shared by all {@code values}
     * @param values  tagged payloads emitted by the map phase for this key
     * @param context sink for the joined lines (written as key, with a null value)
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
        orderList.clear();
        String goodsPayload = "";

        // Pass 1: split the tagged values into buffered orders and the goods payload.
        for (Text tagged : values) {
            String record = tagged.toString();
            if (record.startsWith("o_")) {
                orderList.add(record.substring(2));
            } else if (record.startsWith("g_")) {
                goodsPayload = record.substring(2);
            }
        }

        // Pass 2: append the same goods payload to every buffered order.
        String goodsSuffix = "\t" + goodsPayload;
        for (int i = 0; i < orderList.size(); i++) {
            Text joined = new Text(orderList.get(i) + goodsSuffix);
            context.write(joined, NullWritable.get());
        }
    }
}

4.第二个Map阶段和Reduce阶段

通过第二个MapReduce任务对第一个任务的结果再进行排序，这样可以让结果更整齐一些；另外一个目的是演示如何让两个MapReduce任务串行执行。MapReduce在shuffle阶段会自动按K2（这里是字符串）排序，所以Map阶段只需对每一行按制表符简单切分，取第一个字段作为K2即可。

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.net.URI;

/**
 * Second-stage job that re-emits the joined lines grouped by their first
 * tab-separated column, relying on the framework's key ordering between the
 * map and reduce phases to sort the output.
 */
public class ReducerJoinDemo {
    /**
     * Keys every input line by its first tab-separated column and passes the
     * whole line through unchanged as the value.
     */
    public static class ReducerJoinSortMapper extends Mapper<LongWritable, Text,Text,Text>{
        /**
         * @param key     input key supplied by the input format (unused)
         * @param value   one tab-separated line
         * @param context sink for the (first column, whole line) pair
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, Text>.Context context) throws IOException, InterruptedException {
            String line = value.toString();
            String[] columns = line.split("\t");
            Text sortKey = new Text(columns[0]);
            context.write(sortKey, value);
        }
    }

    /**
     * Writes every grouped line back out as the output key; the grouping key
     * itself is not written.
     */
    public static class ReducerJoinSortReducer extends Reducer<Text,Text,Text, NullWritable>{
        /**
         * @param key     the first column shared by all {@code values} (not written)
         * @param values  the full lines that share {@code key}
         * @param context sink for each line, paired with a null value
         */
        @Override
        protected void reduce(Text key, Iterable<Text> values, Reducer<Text, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
            for (Text line : values) {
                context.write(line, NullWritable.get());
            }
        }
    }
}

5.结合串行的Driver运行入口

通过JobControl可以把多个MapReduce任务按依赖关系串联起来，一次提交后依次执行。

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;
import org.apache.hadoop.mapreduce.lib.jobcontrol.JobControl;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * Driver that chains two MapReduce jobs with {@link JobControl}: the reduce-side
 * join job runs first, and the sort job consumes its output once it has finished.
 */
public class ControlledJobDriver {
    /** Pause between completion checks, so the driver does not spin a CPU core while waiting. */
    private static final long POLL_INTERVAL_MS = 500L;

    /**
     * Configures both jobs, declares the dependency between them, runs them through a
     * {@link JobControl} thread and waits for completion.
     *
     * <p>The list of successful jobs is printed to standard output. If any job failed,
     * the failed list is printed to standard error and the JVM exits with status 1.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if job setup fails or the waiting thread is interrupted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // The output of the first job is the input of the second, so the path is defined once.
        Path joinInput = new Path("hdfs://node1:8020/input/goods");
        Path joinOutput = new Path("hdfs://node1:8020/output/goods");
        Path sortedOutput = new Path("hdfs://node1:8020/output/goods_sort");

        // First job: reduce-side join.
        Job job1 = Job.getInstance(conf, "ControlledJob");
        // Class used to locate the job jar.
        job1.setJarByClass(ControlledJobDriver.class);
        // Mapper and reducer implementations for this job.
        job1.setMapperClass(Mapper_demo.class);
        job1.setReducerClass(Reducer_demo.class);
        // Key/value types emitted by the map phase.
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(Text.class);
        // Key/value types emitted by the reduce phase, i.e. the job's final output types.
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(NullWritable.class);
        // Input and output locations for this job.
        FileInputFormat.setInputPaths(job1, joinInput);
        FileOutputFormat.setOutputPath(job1, joinOutput);

        // Wrap the plain job as a controlled job.
        ControlledJob cj1 = new ControlledJob(conf);
        cj1.setJob(job1);

        // Second job: sort the joined output.
        Job job2 = Job.getInstance(conf, "ReducerJoinDemo");
        job2.setJarByClass(ControlledJobDriver.class);
        job2.setMapperClass(ReducerJoinDemo.ReducerJoinSortMapper.class);
        job2.setReducerClass(ReducerJoinDemo.ReducerJoinSortReducer.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job2, joinOutput);
        FileOutputFormat.setOutputPath(job2, sortedOutput);

        // Wrap the plain job as a controlled job.
        ControlledJob cj2 = new ControlledJob(conf);
        cj2.setJob(job2);

        // The sort job must not start until the join job has completed.
        cj2.addDependingJob(cj1);

        // Controller that submits and tracks both jobs.
        JobControl jc = new JobControl("myctrl");
        jc.addJob(cj1);
        jc.addJob(cj2);

        // JobControl is a Runnable; run it on its own thread and poll from here.
        Thread t = new Thread(jc);
        t.start();

        // Sleep between checks rather than busy-waiting.
        while (!jc.allFinished()) {
            Thread.sleep(POLL_INTERVAL_MS);
        }

        System.out.println(jc.getSuccessfulJobList());
        boolean anyFailed = !jc.getFailedJobList().isEmpty();
        if (anyFailed) {
            System.err.println("Failed jobs: " + jc.getFailedJobList());
        }
        jc.stop();

        if (anyFailed) {
            // Surface the failure to the caller (shell, scheduler) through the exit status.
            System.exit(1);
        }
    }
}