Big Data: Shuffle and MapReduce Programming Examples (Data Deduplication, Multi-Table Query, Inverted Index, Unit Testing)



一. What is Shuffle? ----> the core of MapReduce (a configuration sketch follows this list)
    1. Serialization
    2. Sorting
    3. Partitioning
    4. Combining
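
The shuffle happens between the map phase and the reduce phase, and the four steps above are where it can be influenced: map output keys and values must be Writable types (serialization), keys are sorted, a Partitioner decides which reducer each record goes to, and an optional Combiner pre-aggregates map output. Below is a minimal configuration sketch; the class DeptPartitioner, the commented-out MyCombiner, and the IntWritable/Text key and value types are hypothetical placeholders, not part of the examples later in this post.

package demo.shuffle;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;

public class ShuffleHooks {

    // hypothetical Partitioner: decides which reduce task (partition) each k2/v2 pair is sent to
    public static class DeptPartitioner extends Partitioner<IntWritable, Text> {
        @Override
        public int getPartition(IntWritable key, Text value, int numPartitions) {
            // the same key always lands in the same partition, mirroring HashPartitioner
            return (key.get() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    public static void configure(Job job) {
        job.setPartitionerClass(DeptPartitioner.class);   // 3. partitioning
        job.setNumReduceTasks(3);                         // number of partitions = number of reducers
        //job.setCombinerClass(MyCombiner.class);         // 4. combining (see the inverted-index example below)
        // 1. serialization and 2. sorting are driven by the Writable key/value types themselves
    }
}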


二. MapReduce Programming Examples ------> Goal: learn how to develop a program
    1. Data deduplication:
        Review: in SQL, DISTINCT removes duplicate rows
                and it applies to all of the columns listed after it

    Review: multi-table queries in Oracle
        Classified by the join condition:
        (1) Equijoin
        (2) Non-equijoin
        (3) Outer join
        (4) Self join

        Notes: (1) Given the choice between a multi-table query and a subquery, which should you prefer? (The multi-table query.)
               (2) As a rule of thumb, keep the number of joined tables <= 3; once more than 5 tables are joined, performance suffers.

    2. Multi-table query: equijoin
          Query employee information: department name and employee name

            select d.dname,e.ename
            from emp e,dept d
            where e.deptno=d.deptno;

         Question: how would you implement this for three tables?


    3. Multi-table query: self join: by using table aliases, the same table is treated as if it were multiple tables
            Query employee information: manager name and employee name

            select b.ename,e.ename
            from emp b,emp e
            where b.empno=e.mgr;

            Note: the Cartesian product produced = the square of the original table's row count ----> self joins are not suitable for large tables
                  A better approach: in Oracle, use a hierarchical query (a tree walk with CONNECT BY) instead of the self join

    4. Inverted index


    5. Unit testing with MRUnit


Deduplication

package demo.mr.distinct;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DistinctMain {

    public static void main(String[] args) throws Exception {
        // create a Job: job = map + reduce
        Job job = Job.getInstance(new Configuration());

        // specify the entry point of the job
        job.setJarByClass(DistinctMain.class);

        // specify the Mapper and its output types: k2, v2
        job.setMapperClass(DistinctMapper.class);
        job.setMapOutputKeyClass(Text.class);            // k2
        job.setMapOutputValueClass(NullWritable.class);  // v2

        // specify the Reducer and its output types: k4, v4
        job.setReducerClass(DistinctReducer.class);
        job.setOutputKeyClass(Text.class);             // k4
        job.setOutputValueClass(NullWritable.class);   // v4

        // specify the input path (map) and the output path (reduce)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // run the job
        job.waitForCompletion(true);

    }

}
package demo.mr.distinct;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


public class DistinctMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // sample record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
        String data = value1.toString();

        // split the record into fields
        String[] words = data.split(",");

        // emit the job title (column 3, words[2]) as key2; only the key matters for dedup, so the value is NullWritable
        context.write(new Text(words[2]), NullWritable.get());
    }

}
package demo.mr.distinct;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class DistinctReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text k3, Iterable<NullWritable> v3,Context context) throws IOException, InterruptedException {
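        // the shuffle has already grouped identical keys together, so writing each k3 once produces the distinct result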
        context.write(k3, NullWritable.get());
    }
}
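
A hedged usage note: package DistinctMain, DistinctMapper and DistinctReducer into a jar and submit it with the hadoop jar command; args[0] is the HDFS input directory and args[1] is an output directory that must not exist yet.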

Cartesian product

Implementing the equijoin with MapReduce


Code implementation

package demo.mr.multitable;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MultiTableQueryMain {

    public static void main(String[] args) throws Exception {
        // create a Job: job = map + reduce
        Job job = Job.getInstance(new Configuration());

        // specify the entry point of the job
        job.setJarByClass(MultiTableQueryMain.class);

        // specify the Mapper and its output types: k2, v2
        job.setMapperClass(MultiTableQueryMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);    // k2
        job.setMapOutputValueClass(Text.class);         // v2

        // specify the Reducer and its output types: k4, v4
        job.setReducerClass(MultiTableQueryReducer.class);
        job.setOutputKeyClass(Text.class);     // k4
        job.setOutputValueClass(Text.class);   // v4

        // specify the input path (map) and the output path (reduce)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // run the job
        job.waitForCompletion(true);

    }
}

package demo.mr.multitable;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MultiTableQueryMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // an input record is either a department or an employee
        String data = value1.toString();

        // split the record into fields
        String[] words = data.split(",");

        // decide which table the record belongs to by the number of fields
        if(words.length == 3){
            // department table: key = deptno, value = department name tagged with "*"
            context.write(new IntWritable(Integer.parseInt(words[0])), new Text("*" + words[1]));
        }else{
            // employee table: key = the employee's deptno, value = employee name
            context.write(new IntWritable(Integer.parseInt(words[7])), new Text(words[1]));
        }
    }
}
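
Design note: this is a reduce-side join. Both input files are read by the same Mapper, every record is keyed by its department number, and department names are tagged with a leading "*" so the Reducer can tell them apart from employee names within a single key group.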
package demo.mr.multitable;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class MultiTableQueryReducer extends Reducer<IntWritable, Text, Text, Text> {

    @Override
    protected void reduce(IntWritable k3, Iterable<Text> v3, Context context)
            throws IOException, InterruptedException {
        // variables holding the department name and the list of employee names
        String dname = "";
        String empNameList = "";

        for(Text v:v3){
            String str = v.toString();

            // look for the "*" tag
            int index = str.indexOf("*");
            if(index >= 0){
                // tagged value: the department name
                dname = str.substring(1);
            }else{
                // untagged value: an employee name
                empNameList = str + ";" + empNameList;
            }
        }

        // emit: department name -> list of its employees
        context.write(new Text(dname), new Text(empNameList));
    }
}


Self-join implementation

package demo.mr.selfjoin;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SelfJoinMain {

    public static void main(String[] args) throws Exception {
        // create a Job: job = map + reduce
        Job job = Job.getInstance(new Configuration());

        // specify the entry point of the job
        job.setJarByClass(SelfJoinMain.class);

        // specify the Mapper and its output types: k2, v2
        job.setMapperClass(SelfJoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);    // k2
        job.setMapOutputValueClass(Text.class);         // v2

        // specify the Reducer and its output types: k4, v4
        job.setReducerClass(SelfJoinReducer.class);
        job.setOutputKeyClass(Text.class);     // k4
        job.setOutputValueClass(Text.class);   // v4

        // specify the input path (map) and the output path (reduce)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // run the job
        job.waitForCompletion(true);
    }

}
package demo.mr.selfjoin;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SelfJoinMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // sample record: 7654,MARTIN,SALESMAN,7698,1981/9/28,1250,1400,30
        String data = value1.toString();

        // split the record into fields
        String[] words = data.split(",");

        // each record is emitted twice:
        // 1. as a manager record: key = the employee's own empno, value = name tagged with "*"
        context.write(new IntWritable(Integer.parseInt(words[0])), new Text("*"+words[1]));

        // 2. as an employee record: key = the employee's mgr (the manager's empno), value = name
        try{
            context.write(new IntWritable(Integer.parseInt(words[3])), new Text(words[1]));
        }catch(Exception ex){
            // parsing fails when the mgr field is empty, i.e. this record is the top boss
            context.write(new IntWritable(-1), new Text(words[1]));
        }
    }

}
package demo.mr.selfjoin;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SelfJoinReducer extends Reducer<IntWritable, Text, Text, Text> {

    @Override
    protected void reduce(IntWritable k3, Iterable<Text> v3, Context context)
            throws IOException, InterruptedException {
        // variables holding the manager's name and the list of employee names
        String bossName = "";
        String empNameList = "";

        for(Text t:v3){
            String str = t.toString();

            // look for the "*" tag
            int index = str.indexOf("*");
            if(index >=0 ){
                // tagged value: the manager's name
                bossName = str.substring(1);
            }else{
                // untagged value: an employee name
                empNameList = str + ";" + empNameList;
            }
        }

        // emit only when the group contains both a manager and at least one employee
        // (this also filters out the artificial -1 group created for the top boss)
        if(bossName.length() > 0 && empNameList.length() > 0 )
            context.write(new Text(bossName), new Text(empNameList));
    }
}


Code implementation
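
A hedged walk-through of the data flow, assuming two hypothetical input files data01.txt and data02.txt that both contain the word love: the Mapper emits ("love:data01.txt", "1") once per occurrence; the Combiner sums the counts for each word within each file and re-keys the pair as ("love", "data01.txt:2"); the Reducer then concatenates the per-file counts into a single posting list such as ("love", "(data02.txt:1)(data01.txt:2)"), where the order of the postings depends on the order in which the values arrive.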

package demo.revertedindex;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class RevertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        // find out which file this record comes from, e.g. /indexdata/data01.txt
        String path = ((FileSplit)context.getInputSplit()).getPath().toString();

        // position of the last slash
        int index = path.lastIndexOf("/");
        // extract the file name
        String fileName = path.substring(index + 1);

        // sample record: I love Beijing *******
        String data = value1.toString();
        String[] words = data.split(" ");

        // emit each word keyed together with its file name, with a count of 1
        for(String w:words){
            context.write(new Text(w+":"+fileName), new Text("1"));
        }
    }

}
package demo.revertedindex;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RevertedIndexReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text k3, Iterable<Text> v3, Context context)
            throws IOException, InterruptedException {
        // concatenate the per-file counts produced by the combiner
        String str = "";

        for(Text t:v3){
            str = "("+t.toString()+")" + str;
        }

        // emit: word -> posting list
        context.write(k3, new Text(str));
    }

}
package demo.revertedindex;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class RevertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text k21, Iterable<Text> v21, Context context)
            throws IOException, InterruptedException {
        // sum the occurrences of one word within one file
        int total = 0;
        for(Text v:v21){
            total = total + Integer.parseInt(v.toString());
        }

        // split the key into word and file name, e.g. k21: love:data01.txt
        String data = k21.toString();
        // position of the ":"
        int index = data.indexOf(":");

        String word = data.substring(0, index);      // the word
        String fileName = data.substring(index+1);   // the file name

        // emit: word -> fileName:count
        context.write(new Text(word), new Text(fileName+":"+total));
    }

}
package demo.revertedindex;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class RevertedIndexMain {

    public static void main(String[] args) throws Exception {
        // create a Job: job = map + reduce
        Job job = Job.getInstance(new Configuration());

        // specify the entry point of the job
        job.setJarByClass(RevertedIndexMain.class);

        // specify the Mapper and its output types: k2, v2
        job.setMapperClass(RevertedIndexMapper.class);
        job.setMapOutputKeyClass(Text.class);    // k2
        job.setMapOutputValueClass(Text.class);  // v2

        // set the Combiner for the job
        job.setCombinerClass(RevertedIndexCombiner.class);

        // specify the Reducer and its output types: k4, v4
        job.setReducerClass(RevertedIndexReducer.class);
        job.setOutputKeyClass(Text.class);     // k4
        job.setOutputValueClass(Text.class);   // v4

        // specify the input path (map) and the output path (reduce)
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // run the job
        job.waitForCompletion(true);

    }

}
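
A design note: the Combiner above changes the key shape from word:fileName to word, so the Reducer relies on the Combiner having run. In Hadoop the Combiner is an optional optimization that may be applied zero, one, or several times, so a more defensive version would keep the Combiner's output in the same key/value form as the Mapper's (or do the per-file summing in the Reducer); the simpler version shown here is fine for this teaching example.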

Unit testing
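
MRUnit lets a Mapper and Reducer be exercised as ordinary JUnit tests, with no cluster or HDFS involved. A hedged dependency note: the library is published on Maven Central as org.apache.mrunit:mrunit; for the new org.apache.hadoop.mapreduce API used below, the build of the jar with the hadoop2 classifier is the one to pull in.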

package demo.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

//public class WordCountMapper extends Mapper<k1, v1, k2, v2> {
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key1, Text value1, Context context)
            throws IOException, InterruptedException {
        /*
         * context is the Mapper's context
         * upstream:   HDFS
         * downstream: the Reducer
         */

        // read the input line, e.g.: I love Beijing
        String data = value1.toString();

        // split the line into words
        String[] words = data.split(" ");

        // emit each word
        for(String word:words){
            // k2 is the word; v2 counts one occurrence
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
package demo.wc;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

//public class WordCountReducer extends Reducer<k3, v3, k4, v4> {
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text k3, Iterable<IntWritable> v3,Context context) throws IOException, InterruptedException {
        /*
         * context is the Reducer's context
         * upstream:   the Mapper
         * downstream: HDFS
         */

        // sum up v3
        int total = 0;
        for(IntWritable v:v3){
            total += v.get();
        }

        // emit: k4 the word, v4 its frequency
        context.write(k3, new IntWritable(total));
    }

}
package demo.wc;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class MRUnitWordCount {

    @Test
    public void testMapper() throws Exception{
        // point hadoop.home.dir at a local Hadoop installation
        System.setProperty("hadoop.home.dir", "D:\\temp\\hadoop-2.4.1\\hadoop-2.4.1");

        // the Mapper under test
        WordCountMapper mapper = new WordCountMapper();

        // a MapDriver drives the unit test for the Mapper
        MapDriver<LongWritable, Text, Text, IntWritable> driver = new MapDriver<>(mapper);

        // the Mapper's input: k1, v1
        driver.withInput(new LongWritable(1), new Text("I love Beijing"));

        // the Mapper's expected output: k2, v2
        driver.withOutput(new Text("I"), new IntWritable(1))
              .withOutput(new Text("love"), new IntWritable(1))
              .withOutput(new Text("Beijing"), new IntWritable(1));

        // run the test: compare the expected output with the actual output
        driver.runTest();
    }

    @Test
    public void testReducer() throws Exception{
        // point hadoop.home.dir at a local Hadoop installation
        System.setProperty("hadoop.home.dir", "D:\\temp\\hadoop-2.4.1\\hadoop-2.4.1");

        // the Reducer under test
        WordCountReducer reducer = new WordCountReducer();

        // a ReduceDriver drives the unit test for the Reducer
        ReduceDriver<Text, IntWritable, Text, IntWritable> driver = new ReduceDriver<>(reducer);

        // build v3 as a List
        List<IntWritable> value3 = new ArrayList<>();
        value3.add(new IntWritable(1));
        value3.add(new IntWritable(1));
        value3.add(new IntWritable(1));

        // the Reducer's input
        driver.withInput(new Text("Beijing"), value3);

        // the Reducer's expected output
        driver.withOutput(new Text("Beijing"), new IntWritable(3));

        // run the test
        driver.runTest();
    }

    @Test
    public void testJob() throws Exception{
        // point hadoop.home.dir at a local Hadoop installation
        System.setProperty("hadoop.home.dir", "D:\\temp\\hadoop-2.4.1\\hadoop-2.4.1");

        // the Mapper and Reducer under test
        WordCountMapper mapper = new WordCountMapper();
        WordCountReducer reducer = new WordCountReducer();

        // a MapReduceDriver drives the whole map -> shuffle -> reduce pipeline
        // MapReduceDriver<K1, V1, K2, V2, K4, V4>
        MapReduceDriver<LongWritable, Text, Text, IntWritable, Text, IntWritable>
                driver = new MapReduceDriver<>(mapper,reducer);

        // the Mapper's input
        driver.withInput(new LongWritable(1), new Text("I love Beijing"))
              .withInput(new LongWritable(4), new Text("I love China"))
              .withInput(new LongWritable(7), new Text("Beijing is the capital of China"));

        // expected Reducer output listed in the original input order fails the test (kept for reference):
//      driver.withOutput(new Text("I"), new IntWritable(2))
//            .withOutput(new Text("love"), new IntWritable(2))
//            .withOutput(new Text("Beijing"), new IntWritable(2))
//            .withOutput(new Text("China"), new IntWritable(2))
//            .withOutput(new Text("is"), new IntWritable(1))
//            .withOutput(new Text("the"), new IntWritable(1))
//            .withOutput(new Text("capital"), new IntWritable(1))
//            .withOutput(new Text("of"), new IntWritable(1));

        // the shuffle sorts by key, so the expected output must be listed in sorted key order
        driver.withOutput(new Text("Beijing"), new IntWritable(2))
              .withOutput(new Text("China"), new IntWritable(2))
              .withOutput(new Text("I"), new IntWritable(2))
              .withOutput(new Text("capital"), new IntWritable(1))
              .withOutput(new Text("is"), new IntWritable(1))
              .withOutput(new Text("love"), new IntWritable(2))
              .withOutput(new Text("of"), new IntWritable(1))
              .withOutput(new Text("the"), new IntWritable(1));

        driver.runTest();
    }
}
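
These three tests run directly from the IDE as plain JUnit tests. One hedged environment note: on Windows, the directory that hadoop.home.dir points to needs bin\winutils.exe in place, otherwise Hadoop's local runtime fails to initialize; on Linux or macOS the property can usually be omitted.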