[Classic] MapReduce WordCount example: requirements 1-4

* Requirement 1: count the total number of occurrences of each word across a set of given text files
* Requirement 2: partition words by whether the ASCII code of their first letter is odd or even
* Requirement 3: do a local aggregation of each map task's output
*  Plan 1: write a combiner class that extends Reducer, which is essentially WordCountReducer written again (see the sketch after this list)
*  Plan 2: register WordCountReducer directly as the CombinerClass
* Requirement 4: optimize input splits for a large number of small files
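For plan 1 of requirement 3, a minimal sketch of such a combiner could look like the following (the class name WordCountCombiner and its package are my own choice for this sketch, not part of the original project):

package combiner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Plan 1 for requirement 3: a combiner is just a Reducer that runs on each
 * map task's local output, so its logic mirrors WordCountReducer.
 */
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        // Sum the partial counts emitted by this map task for the current word
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}

With plan 1 the driver would call job.setCombinerClass(WordCountCombiner.class); with plan 2 (used in the driver below) it simply reuses WordCountReducer.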

1. Mapper stage

package mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @ClassName WordCountMapper
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 10:27
 * @Version 1.0
 **/

/**
 * Requirement 1: count the total number of occurrences of each word across a set of given text files
 * Requirement 2: partition words by whether the ASCII code of their first letter is odd or even
 * Requirement 3: do a local aggregation of each map task's output
 *  Plan 1: write a combiner class that extends Reducer, which is essentially WordCountReducer written again
 *  Plan 2: register WordCountReducer directly as the CombinerClass
 * Requirement 4: optimize input splits for a large number of small files
 * key-in: byte offset of the start of the line being read ==> LongWritable (Hadoop's serializable type)
 * value-in: content of the line ==> Text
 * key-out: the word ==> Text
 * value-out: number of occurrences of the word ==> IntWritable
 *
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    //The business logic of the map phase goes into the overridden map method
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Convert the text content handed to us by the map task into a String
        String line = value.toString();

        //2. Split the string into words on spaces
        String[] split = line.split(" ");

        //3. Emit each word as a (word, 1) key-value pair
        for (String word : split) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
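For example, for the input line "hello world" the mapper emits the pairs (hello, 1) and (world, 1); the framework then groups all pairs by key before they reach the reducer.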

2. Reducer stage

package reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ClassName WordCountReducer
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 11:29
 * @Version 1.0
 **/
public class WordCountReducer extends Reducer<Text, IntWritable, Text,IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        //1. Sum up the number of occurrences of this key
        for (IntWritable value:values){
            count += value.get();
        }

        //2. Emit the total count for this key
        context.write(key,new IntWritable(count));
    }
}
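For example, the reduce call for the key hello receives one 1 per occurrence of the word, e.g. the values (1, 1, 1), and writes out (hello, 3).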

 

3. Driver stage

package driver;


import mapper.WordCountMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import partitioner.WordCountPartitioner;
import reducer.WordCountReducer;

import java.io.IOException;

/**
 * @ClassName WordCount
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 11:38
 * @Version 1.0
 **/
public class WordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //1. Get the configuration and create a Job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        //2. Point the job at the jar that contains this driver class
        job.setJarByClass(WordCount.class);

        //3. Wire up the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        //4. Specify the key/value types of the mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //5. Specify the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //6. Specify the input and output paths of the job
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //Requirement 2: register the custom partitioner
        job.setPartitionerClass(WordCountPartitioner.class);
        job.setNumReduceTasks(2);//also determines the number of output files
        //Requirement 3 (plan 2): reuse the Reducer as the combiner
        job.setCombinerClass(WordCountReducer.class);
        //Requirement 4: merge small files into fewer splits
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);//4 MB
        CombineTextInputFormat.setMinInputSplitSize(job, 2097152);//2 MB


        //7. Submit the job and wait for it to finish (waitForCompletion submits the job itself)
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }
}
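Two notes on the driver: if you follow plan 1 of requirement 3, the combiner line becomes job.setCombinerClass(WordCountCombiner.class) (using the sketch above); and the output directory passed as args[1] must not already exist, because FileOutputFormat fails the job if it does.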

4. Custom partitioner (Requirement 2)

package partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partition words by whether the ASCII code of their first letter is odd or even
 * @ClassName WordCountPartitioner
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 17:16
 * @Version 1.0
 **/
public class WordCountPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int num) {
        //1. Get the first character of the word key
        String substring = text.toString().substring(0, 1);
        char[] chars = substring.toCharArray();
        int result = chars[0];//assigning the char to an int gives its ASCII code directly

        //2. Choose the partition based on the parity of that code
        if (result % 2 == 0) {
            return 0;//partition 0
        } else {
            return 1;//partition 1
        }
    }
}
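Applied to the test data below: 'h' (hello, hadoop) has ASCII code 104, which is even, so those words go to partition 0; 'c' (99), 's' (115) and 'w' (119) are odd, so clickhouse, spark and world go to partition 1. The two possible return values 0 and 1 line up with the setNumReduceTasks(2) call in the driver, so each partition is handled by one reduce task.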

5. Test data

hello world
clickhouse clickhouse
hadoop
spark
hello world
clickhouse clickhouse
hadoop
spark
hello world
clickhouse clickhouse
hadoop
spark
clickhouse clickhouse
hadoop
spark
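With the partitioner and two reduce tasks configured above, this input should produce two output files along the following lines (counts tallied from the data above, key and value separated by a tab):

part-r-00000 (even first-letter ASCII code):
hadoop	4
hello	3

part-r-00001 (odd first-letter ASCII code):
clickhouse	8
spark	4
world	3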

Command for submitting the jar on the cluster

hadoop jar xx.jar driver.WordCount arg1 arg2

arg1: input file or directory

arg2: output directory
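Once the job has finished, one way to inspect the results is, for example, hadoop fs -cat <output directory>/part-r-00000, substituting whatever output directory you passed as arg2.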
