【大数据离线开发】6.3 MapReduce案例集锦

小卓仗剑走天涯

已于 2022-08-01 14:46:33 修改

阅读量983

点赞数 1

分类专栏：大数据从入门到精通文章标签：大数据 mapreduce 数据库

于 2022-07-19 14:32:42 首次发布

本文链接：https://blog.csdn.net/m0_66345324/article/details/125871473

版权

大数据从入门到精通专栏收录该内容

28 篇文章

订阅专栏

文章目录

- - 6.3 MapReduce案例集锦

6.3 MapReduce案例集锦

6.3.1 数据去重

复习SQL：distinct 用于去掉重复的数据，它作用于后面所有的列——只要各列组合起来的值不同，就视为不同的记录

一个列：
	select job from emp;
	select distinct job from emp;
多个列：
	select distinct deptno, job from emp;

案例：使用 MapReduce 实现 distinct 对一个列的去重

DistinctMapper.java

/**
 * Mapper for the single-column DISTINCT job.
 *
 * <p>Each input line is split on commas and the third field (index 2) is emitted as the
 * output key with a {@link NullWritable} value. Identical field values therefore end up
 * under the same key after the shuffle.
 */
public class DistinctMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    /** Zero-based index of the comma-separated field that is being de-duplicated. */
    private static final int DISTINCT_COLUMN = 2;

    /**
     * Emits the de-duplication column of one input line.
     *
     * @param key1    byte offset of the line (unused)
     * @param value1  one comma-separated input line
     * @param context task context used to emit the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key1, Text value1, Context context) throws IOException, InterruptedException {
        String[] words = value1.toString().split(",");

        // Blank or truncated lines have no such column; skip them instead of failing
        // the whole task with an ArrayIndexOutOfBoundsException, and count them so the
        // skip is visible in the job counters.
        if (words.length <= DISTINCT_COLUMN) {
            context.getCounter("Distinct", "MALFORMED_RECORDS").increment(1);
            return;
        }

        context.write(new Text(words[DISTINCT_COLUMN]), NullWritable.get());
    }
}

DistinctReducer.java

/**
 * Reducer for the single-column DISTINCT job.
 *
 * <p>Writes every incoming key exactly once and ignores its values, so each distinct
 * key appears a single time in the job output.
 */
public class DistinctReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    /**
     * Emits the key of the current group with an empty value.
     *
     * @param key3    the distinct value for this group
     * @param values3 the grouped placeholder values (not read)
     * @param context task context used to emit the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key3, Iterable<NullWritable> values3, Context context) throws IOException, InterruptedException {
        final NullWritable noValue = NullWritable.get();
        context.write(key3, noValue);
    }
}

DistinctMain.java

/**
 * Driver that configures and submits the single-column DISTINCT job.
 *
 * <p>Usage: {@code DistinctMain <input path> <output path>}
 */
public class DistinctMain {
    /**
     * Builds the job, runs it, and exits with a status that reflects the job result.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be created, configured, or submitted
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: DistinctMain <input path> <output path>");
            System.exit(2);
        }

        // 1. Create the job and name the class that locates the job jar.
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(DistinctMain.class);

        // 2. Mapper and the types of the map output.
        job.setMapperClass(DistinctMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // 3. Reducer and the types of the final output.
        job.setReducerClass(DistinctReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // 4. Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 5. Run the job; propagate failure to the caller through the exit status
        //    (the return value of waitForCompletion was previously discarded).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

在这里插入图片描述

6.3.2 多表查询：等值连接

复习多表查询：关系型数据库中的多表查询（子查询：在 Oracle 中，绝大部分的子查询都是转换成多表查询来执行）

笛卡尔积：列数相加，行数相乘；如果不设置连接条件，查询出来的结果就是笛卡尔积全集（行数为各表行数的乘积）
根据连接条件的不同
- 等值连接
- 不等值连接
- 外连接
- 自连接

在这里插入图片描述

案例：等值连接实现下面的SQL语句

select ename, dname from emp, dept where emp.deptno = dept.deptno;

在这里插入图片描述

EqualJoinMapper.java

/**
 * Mapper for the equi-join of employee and department data.
 *
 * <p>Both inputs are read as comma-separated lines and told apart by field count:
 * a line with exactly three fields is treated as a department row, a line with at
 * least eight fields as an employee row. Either way the department number becomes
 * the output key. Department names are prefixed with {@code "*"} so the reducer can
 * recognise them.
 */
public class EqualJoinMapper extends Mapper<LongWritable, Text, IntWritable, Text> {
    /** Number of fields in a department row. */
    private static final int DEPT_FIELD_COUNT = 3;

    /** Zero-based index of the department number in an employee row. */
    private static final int EMP_DEPTNO_INDEX = 7;

    /**
     * Emits {@code (deptno, "*" + dname)} for a department row or
     * {@code (deptno, ename)} for an employee row.
     *
     * @param key1    byte offset of the line (unused)
     * @param value1  one comma-separated input line
     * @param context task context used to emit the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key1, Text value1, Context context) throws IOException, InterruptedException {
        String[] words = value1.toString().split(",");

        try {
            if (words.length == DEPT_FIELD_COUNT) {
                // Department row: department number -> marked department name.
                context.write(new IntWritable(Integer.parseInt(words[0])), new Text("*" + words[1]));
            } else if (words.length > EMP_DEPTNO_INDEX) {
                // Employee row: employee's department number -> employee name.
                context.write(new IntWritable(Integer.parseInt(words[EMP_DEPTNO_INDEX])), new Text(words[1]));
            } else {
                // Neither shape (e.g. a blank line). Previously this fell into the
                // employee branch and crashed on words[7].
                context.getCounter("EqualJoin", "MALFORMED_RECORDS").increment(1);
            }
        } catch (NumberFormatException ignored) {
            // Non-numeric department number: skip the record and make it visible
            // in the job counters rather than failing the task.
            context.getCounter("EqualJoin", "MALFORMED_RECORDS").increment(1);
        }
    }
}

EqualReducer.java

/**
 * Reducer for the equi-join of employee and department data.
 *
 * <p>All values for one department number arrive together. A value that starts with
 * {@code "*"} carries the department name; every other value is an employee name.
 * The output is the department name followed by the employee names, each terminated
 * by {@code ";"}. A record is written for every key, even if one of the two sides
 * is empty.
 */
public class EqualReducer extends Reducer<IntWritable, Text ,Text, Text> {
    /** Prefix the mapper puts in front of a department name. */
    private static final String DEPT_MARKER = "*";

    /**
     * Joins the department name with the employees of that department.
     *
     * @param key3    department number
     * @param values3 marked department name and plain employee names
     * @param context task context used to emit the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(IntWritable key3, Iterable<Text> values3, Context context) throws IOException, InterruptedException {
        String dname = "";
        StringBuilder empNameList = new StringBuilder();

        for (Text value : values3) {
            String str = value.toString();

            // Only a leading marker identifies a department name. The previous
            // indexOf("*") >= 0 test also matched a '*' inside an employee name
            // and then chopped off that name's first character.
            if (str.startsWith(DEPT_MARKER)) {
                dname = str.substring(DEPT_MARKER.length());
            } else {
                // Prepend so the resulting order is the same as before.
                empNameList.insert(0, str + ";");
            }
        }

        context.write(new Text(dname), new Text(empNameList.toString()));
    }
}

EqualJoinMain.java

/**
 * Driver that configures and submits the equi-join job.
 *
 * <p>Usage: {@code EqualJoinMain <input path> <output path>}
 */
public class EqualJoinMain {
    /**
     * Builds the job, runs it, and exits with a status that reflects the job result.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be created, configured, or submitted
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: EqualJoinMain <input path> <output path>");
            System.exit(2);
        }

        // 1. Create the job and name the class that locates the job jar.
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(EqualJoinMain.class);

        // 2. Mapper and the types of the map output.
        job.setMapperClass(EqualJoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        // 3. Reducer and the types of the final output.
        job.setReducerClass(EqualReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 4. Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 5. Run the job; propagate failure to the caller through the exit status
        //    (the return value of waitForCompletion was previously discarded).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

在这里插入图片描述

6.3.3 多表查询：自连接

自连接就是同一张表与自身的连接操作：通过表的别名，把一张表当作多张表来使用

举例：查询员工信息，要求显示：老板的名字、员工的名字

select b.ename, e.ename
from emp b, emp e
where b.empno = e.mgr;

在Oracle中，当查询的数据满足是一棵树的时候，可以使用层次查询来取代自连接

在这里插入图片描述

SelfJoinMapper.java

/**
 * Mapper for the self join of the employee data (boss name to employee names).
 *
 * <p>Every employee row is emitted twice: once keyed by its own employee number with
 * the name prefixed by {@code "*"} (the row in its role as a potential boss), and once
 * keyed by its manager number with the plain name (the row in its role as an employee).
 *
 * <p>Sample row: {@code 7654,MARTIN,SALESMAN,7698,1998/9/29,1250,1400,30}
 */
public class SelfJoinMapper extends Mapper<LongWritable, Text, IntWritable,Text> {
    /** Zero-based index of the manager number field. */
    private static final int MGR_INDEX = 3;

    /**
     * Emits the boss-side and employee-side records for one employee row.
     *
     * @param key1    byte offset of the line (unused)
     * @param value1  one comma-separated employee row
     * @param context task context used to emit the output records
     * @throws IOException          if writing an output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key1, Text value1, Context context) throws IOException, InterruptedException {
        String[] words = value1.toString().split(",");

        // Data cleansing: a row without a manager column cannot take part in the join.
        if (words.length <= MGR_INDEX) {
            context.getCounter("SelfJoin", "MALFORMED_RECORDS").increment(1);
            return;
        }

        // Boss side: keyed by the row's own employee number.
        Integer empNo = parseIdOrNull(words[0]);
        if (empNo != null) {
            context.write(new IntWritable(empNo), new Text("*" + words[1]));
        } else {
            context.getCounter("SelfJoin", "MALFORMED_RECORDS").increment(1);
        }

        // Employee side: keyed by the manager number. An empty manager field (an
        // employee with no boss) used to raise NumberFormatException and kill the
        // task; such a row has no join partner, so it is simply not emitted.
        Integer mgrNo = parseIdOrNull(words[MGR_INDEX]);
        if (mgrNo != null) {
            context.write(new IntWritable(mgrNo), new Text(words[1]));
        }
    }

    /** Parses a numeric id, returning {@code null} when the field is empty or not a number. */
    private static Integer parseIdOrNull(String field) {
        try {
            return Integer.valueOf(field);
        } catch (NumberFormatException ignored) {
            // Not a usable id; the caller decides how to treat the record.
            return null;
        }
    }
}

SelfJoinReduce.java

/**
 * Reducer for the self join of the employee data.
 *
 * <p>For one employee number it receives the boss name (prefixed with {@code "*"})
 * and the names of the employees who report to that number. A record
 * {@code (bossName, "emp1;emp2;...")} is written only when both sides are present.
 */
public class SelfJoinReduce extends Reducer<IntWritable, Text, Text, Text> {
    /** Prefix the mapper puts in front of a boss name. */
    private static final String BOSS_MARKER = "*";

    /**
     * Joins the boss name with the names of the direct reports.
     *
     * @param key3    employee number acting as the join key
     * @param values3 marked boss name and plain employee names
     * @param context task context used to emit the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(IntWritable key3, Iterable<Text> values3, Context context) throws IOException, InterruptedException {
        String bossName = "";
        StringBuilder empNameList = new StringBuilder();

        for (Text t : values3) {
            String str = t.toString();

            // Only a leading marker identifies the boss. The previous
            // indexOf("*") >= 0 test also matched a '*' inside an employee name
            // and then chopped off that name's first character.
            if (str.startsWith(BOSS_MARKER)) {
                bossName = str.substring(BOSS_MARKER.length());
            } else {
                // Prepend so the resulting order is the same as before.
                empNameList.insert(0, str + ";");
            }
        }

        // Emit only when there is both a boss and at least one employee.
        if (bossName.length() > 0 && empNameList.length() > 0) {
            context.write(new Text(bossName), new Text(empNameList.toString()));
        }
    }
}

SelfJoinMain.java

/**
 * Driver that configures and submits the self-join job.
 *
 * <p>Usage: {@code SelfJoinMain <input path> <output path>}
 */
public class SelfJoinMain {
    /**
     * Builds the job, runs it, and exits with a status that reflects the job result.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be created, configured, or submitted
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: SelfJoinMain <input path> <output path>");
            System.exit(2);
        }

        // 1. Create the job and name the class that locates the job jar.
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(SelfJoinMain.class);

        // 2. Mapper and the types of the map output.
        job.setMapperClass(SelfJoinMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        // 3. Reducer and the types of the final output.
        job.setReducerClass(SelfJoinReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 4. Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 5. Run the job; propagate failure to the caller through the exit status
        //    (the return value of waitForCompletion was previously discarded).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

在这里插入图片描述

6.3.4 倒排索引

“倒排索引”是文档检索系统中最常用的数据结构，被广泛地应用于全文搜索引擎。它主要是用来存储某个单词（或词组）在一个文档或一组文档中的存储位置的映射，即提供了一种根据内容来查找文档的方式。由于不是根据文档来确定文档所包含的内容，而是进行相反的操作，因而称为倒排索引（Inverted Index）。

在这里插入图片描述

创建三个文本文件

vi data01.txt------------I love Beijing and love Shanghai
vi data02.txt------------I love China
vi data03.txt------------Beijing is the capital of China

hdfs dfs -mkdir /indexdata
hdfs dfs -put data0*.txt /indexdata

RevertedIndexMapper.java

/**
 * Mapper for the inverted-index job.
 *
 * <p>For every word of an input line it emits {@code ("<word>:<fileName>", "1")},
 * where {@code fileName} is the last path component of the file the current split
 * belongs to.
 *
 * <p>Sample line: {@code I love Beijing and love Shanghai}
 */
public class RevertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {
    /**
     * Emits one {@code ("<word>:<fileName>", "1")} record per word of the line.
     *
     * @param key1    byte offset of the line (unused)
     * @param value1  one line of text, words separated by single spaces
     * @param context task context; also supplies the input split of this task
     * @throws IOException          if writing an output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key1, Text value1, Context context) throws IOException, InterruptedException {
        // Path.getName() returns the final path component, replacing the manual
        // lastIndexOf("/") + substring.
        String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();

        for (String word : value1.toString().split(" ")) {
            // Blank lines and consecutive spaces produce empty tokens; indexing
            // them would create a bogus entry for the empty word.
            if (word.isEmpty()) {
                continue;
            }
            context.write(new Text(word + ":" + fileName), new Text("1"));
        }
    }
}

RevertedIndexCombiner.java

/**
 * Combiner for the inverted-index job.
 *
 * <p>Turns the mapper's {@code ("<word>:<fileName>", "1")} records into one
 * {@code ("<word>", "<fileName>:<count>")} record per word and file.
 *
 * <p>NOTE(review): this combiner changes the key and value format, but the framework
 * does not guarantee how many times a combiner runs. This class is therefore written
 * to be re-entrant: records that are already in the combined format are forwarded
 * unchanged. If the combiner is not run at all, the reducer still receives
 * {@code "<word>:<fileName>"} keys; that case cannot be handled here and needs the
 * aggregation to move out of the combiner.
 */
public class RevertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
    /**
     * Sums the per-file occurrences of a word and re-keys the record by the word alone.
     *
     * @param key21    {@code "<word>:<fileName>"}, e.g. {@code love:data01.txt}
     * @param values21 occurrence counts as decimal strings
     * @param context  task context used to emit the output record
     * @throws IOException          if the key has an unexpected format or writing fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key21, Iterable<Text> values21, Context context) throws IOException, InterruptedException {
        int total = 0;
        boolean sawRawCount = false;

        for (Text v : values21) {
            String value = v.toString();
            if (value.indexOf(':') >= 0) {
                // Already "<fileName>:<count>" from an earlier combiner pass:
                // forward it as is instead of failing in Integer.parseInt.
                context.write(key21, v);
            } else {
                total += Integer.parseInt(value);
                sawRawCount = true;
            }
        }

        if (!sawRawCount) {
            return;
        }

        // Split at the LAST colon so a word that itself contains ':' stays intact.
        // Assumes the file name contains no ':' -- TODO confirm for the target file system.
        String data = key21.toString();
        int index = data.lastIndexOf(':');
        if (index < 0) {
            throw new IOException("Expected key of the form <word>:<fileName> but got: " + data);
        }

        String word = data.substring(0, index);
        String fileName = data.substring(index + 1);

        context.write(new Text(word), new Text(fileName + ":" + total));
    }
}

RevertedIndexRuducer.java

/**
 * Reducer for the inverted-index job.
 *
 * <p>Wraps every value of a key in parentheses and concatenates them, with later
 * values placed in front of earlier ones, then writes the key with that string.
 */
public class RevertedIndexRuducer extends Reducer<Text, Text, Text, Text> {
    /**
     * Builds the parenthesised posting list for one key.
     *
     * @param key3    the key of the group
     * @param values3 the values of the group
     * @param context task context used to emit the output record
     * @throws IOException          if writing the output record fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key3, Iterable<Text> values3, Context context) throws IOException, InterruptedException {
        StringBuilder postings = new StringBuilder();

        for (Text posting : values3) {
            // Insert at the front: the most recently read value comes first.
            postings.insert(0, "(" + posting.toString() + ")");
        }

        Text joined = new Text(postings.toString());
        context.write(key3, joined);
    }
}

RevertedIndexMain.java

/**
 * Driver that configures and submits the inverted-index job.
 *
 * <p>Usage: {@code RevertedIndexMain <input path> <output path>}
 */
public class RevertedIndexMain {
    /**
     * Builds the job, runs it, and exits with a status that reflects the job result.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @throws Exception if the job cannot be created, configured, or submitted
     */
    public static void main(String[] args) throws Exception {
        // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: RevertedIndexMain <input path> <output path>");
            System.exit(2);
        }

        // 1. Create the job and name the class that locates the job jar.
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(RevertedIndexMain.class);

        // 2. Mapper and the types of the map output.
        job.setMapperClass(RevertedIndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        // Combiner that pre-aggregates the per-file word counts.
        job.setCombinerClass(RevertedIndexCombiner.class);

        // 3. Reducer and the types of the final output.
        job.setReducerClass(RevertedIndexRuducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // 4. Input and output locations.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // 5. Run the job; propagate failure to the caller through the exit status
        //    (the return value of waitForCompletion was previously discarded).
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

在这里插入图片描述

6.3.5 使用单元测试（MRUnit）

MRUnit 是针对 MapReduce 程序的单元测试框架，使用的时候需要从官网 http://mrunit.apache.org/ 下载jar包。
其基本原理是JUnit和EasyMock：核心的单元测试依赖于JUnit，并且MRUnit实现了一套Mock对象来控制MapReduce框架的输入和输出；语法也比较简单。

注意：需要把mockito-all-1.8.5.jar 从Build Path中去掉

以WordCount为例：

/**
 * MRUnit-based unit tests for the WordCount mapper, the WordCount reducer, and the
 * two of them chained together.
 *
 * <p>Each test registers input records and expected output records on an MRUnit
 * driver and then calls {@code runTest()} to compare the actual output with the
 * expected one.
 */
public class MRUnitWordCount {

	/** Value assigned to the {@code hadoop.home.dir} system property before each test. */
	private static final String HADOOP_HOME = "D:\\temp\\hadoop-2.4.1\\hadoop-2.4.1";

	/** Sets the {@code hadoop.home.dir} system property. */
	private static void configureHadoopHome() {
		System.setProperty("hadoop.home.dir", HADOOP_HOME);
	}

	/**
	 * Tests the mapper on a single line.
	 * @throws Exception if the driver fails or the outputs do not match
	 */
	@Test
	public void testMapper() throws Exception{
		configureHadoopHome();

		// Driver wrapping the mapper under test.
		MapDriver<LongWritable, Text, Text, IntWritable> driver =
				new MapDriver<>(new WordCountMapper());

		// Map input: k1, v1
		driver.withInput(new LongWritable(1), new Text("I love Beijing"));

		// Expected map output: k2, v2
		driver.withOutput(new Text("I"), new IntWritable(1));
		driver.withOutput(new Text("love"), new IntWritable(1));
		driver.withOutput(new Text("Beijing"), new IntWritable(1));

		// Run and compare expected against actual.
		driver.runTest();
	}

	/**
	 * Tests the reducer on one key with three values.
	 * @throws Exception if the driver fails or the outputs do not match
	 */
	@Test
	public void testReducer() throws Exception{
		configureHadoopHome();

		// Driver wrapping the reducer under test.
		ReduceDriver<Text, IntWritable, Text, IntWritable> driver =
				new ReduceDriver<>(new WordCountReducer());

		// v3: three occurrences, each counted as 1.
		List<IntWritable> value3 = new ArrayList<>();
		for (int i = 0; i < 3; i++) {
			value3.add(new IntWritable(1));
		}

		// Reducer input and expected reducer output.
		driver.withInput(new Text("Beijing"), value3);
		driver.withOutput(new Text("Beijing"), new IntWritable(3));

		driver.runTest();
	}

	/**
	 * Tests mapper and reducer together.
	 * @throws Exception if the driver fails or the outputs do not match
	 */
	@Test
	public void testJob() throws Exception{
		configureHadoopHome();

		// Type parameters are <K1, V1, K2, V2, K4, V4>.
		MapReduceDriver<LongWritable, Text, Text, IntWritable, Text, IntWritable> driver =
				new MapReduceDriver<>(new WordCountMapper(), new WordCountReducer());

		// Map input.
		driver.withInput(new LongWritable(1), new Text("I love Beijing"));
		driver.withInput(new LongWritable(4), new Text("I love China"));
		driver.withInput(new LongWritable(7), new Text("Beijing is the capital of China"));

		// Expected reducer output. Ordering has to be taken into account: the
		// records are listed in sorted key order, not in the order in which the
		// words first appear in the input.
		driver.withOutput(new Text("Beijing"), new IntWritable(2));
		driver.withOutput(new Text("China"), new IntWritable(2));
		driver.withOutput(new Text("I"), new IntWritable(2));
		driver.withOutput(new Text("capital"), new IntWritable(1));
		driver.withOutput(new Text("is"), new IntWritable(1));
		driver.withOutput(new Text("love"), new IntWritable(2));
		driver.withOutput(new Text("of"), new IntWritable(1));
		driver.withOutput(new Text("the"), new IntWritable(1));

		driver.runTest();
	}
}