[Classic] MapReduce WordCount example: requirements 1-4

* Requirement 1: count the total number of occurrences of each word across a set of given text files
* Requirement 2: partition words by whether the ASCII code of their first letter is odd or even
* Requirement 3: do a local aggregation of each map task's output
*  Plan 1: write a combiner class that extends Reducer, which is essentially WordCountReducer written again (see the sketch after this list)
*  Plan 2: register WordCountReducer directly as the CombinerClass
* Requirement 4: optimize input splits for a large number of small files
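For plan 1 of requirement 3, a minimal sketch of such a combiner could look like the following (the class name WordCountCombiner and its package are my own choice for this sketch, not part of the original project):

package combiner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Plan 1 for requirement 3: a combiner is just a Reducer that runs on each
 * map task's local output, so its logic mirrors WordCountReducer.
 */
public class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        // Sum the partial counts emitted by this map task for the current word
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}

With plan 1 the driver would call job.setCombinerClass(WordCountCombiner.class); with plan 2 (used in the driver below) it simply reuses WordCountReducer.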

1. Mapper stage

package mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @ClassName WordCountMapper
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 10:27
 * @Version 1.0
 **/

/**
 * Requirement 1: count the total number of occurrences of each word across a set of given text files
 * Requirement 2: partition words by whether the ASCII code of their first letter is odd or even
 * Requirement 3: do a local aggregation of each map task's output
 *  Plan 1: write a combiner class that extends Reducer, which is essentially WordCountReducer written again
 *  Plan 2: register WordCountReducer directly as the CombinerClass
 * Requirement 4: optimize input splits for a large number of small files
 * key-in: byte offset of the start of the line being read ==> LongWritable (Hadoop's serializable type)
 * value-in: content of the line ==> Text
 * key-out: the word ==> Text
 * value-out: number of occurrences of the word ==> IntWritable
 *
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    //The business logic of the map phase goes into the overridden map method
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //1. Convert the text content handed to us by the map task into a String
        String line = value.toString();

        //2. Split the string into words on spaces
        String[] split = line.split(" ");

        //3. Emit each word as a (word, 1) key-value pair
        for (String word : split) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
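For example, for the input line "hello world" the mapper emits the pairs (hello, 1) and (world, 1); the framework then groups all pairs by key before they reach the reducer.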

2. Reducer stage

package reducer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ClassName WordCountReducer
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 11:29
 * @Version 1.0
 **/
public class WordCountReducer extends Reducer<Text, IntWritable, Text,IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        //1. Sum up the number of occurrences of this key
        for (IntWritable value:values){
            count += value.get();
        }

        //2. Emit the total count for this key
        context.write(key,new IntWritable(count));
    }
}
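For example, the reduce call for the key hello receives one 1 per occurrence of the word, e.g. the values (1, 1, 1), and writes out (hello, 3).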

 

3. Driver stage

package driver;


import mapper.WordCountMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import partitioner.WordCountPartitioner;
import reducer.WordCountReducer;

import java.io.IOException;

/**
 * @ClassName WordCount
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 11:38
 * @Version 1.0
 **/
public class WordCount {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //1. Get the configuration and create a Job instance
        Configuration configuration = new Configuration();
        Job job = Job.getInstance(configuration);

        //2. Point the job at the jar that contains this driver class
        job.setJarByClass(WordCount.class);

        //3. Wire up the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        //4. Specify the key/value types of the mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        //5. Specify the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //6. Specify the input and output paths of the job
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        //Requirement 2: register the custom partitioner
        job.setPartitionerClass(WordCountPartitioner.class);
        job.setNumReduceTasks(2);//also determines the number of output files
        //Requirement 3 (plan 2): reuse the Reducer as the combiner
        job.setCombinerClass(WordCountReducer.class);
        //Requirement 4: merge small files into fewer splits
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);//4 MB
        CombineTextInputFormat.setMinInputSplitSize(job, 2097152);//2 MB


        //7. Submit the job and wait for it to finish (waitForCompletion submits the job itself)
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);

    }
}
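Two notes on the driver: if you follow plan 1 of requirement 3, the combiner line becomes job.setCombinerClass(WordCountCombiner.class) (using the sketch above); and the output directory passed as args[1] must not already exist, because FileOutputFormat fails the job if it does.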

4. Custom partitioner (Requirement 2)

package partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partition words by whether the ASCII code of their first letter is odd or even
 * @ClassName WordCountPartitioner
 * @Description ToDo
 * @Author qin
 * @Date 2019/7/18 17:16
 * @Version 1.0
 **/
public class WordCountPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int num) {
        //1. Get the first character of the word key
        String substring = text.toString().substring(0, 1);
        char[] chars = substring.toCharArray();
        int result = chars[0];//assigning the char to an int gives its ASCII code directly

        //2. Choose the partition based on the parity of that code
        if (result % 2 == 0) {
            return 0;//partition 0
        } else {
            return 1;//partition 1
        }
    }
}
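Applied to the test data below: 'h' (hello, hadoop) has ASCII code 104, which is even, so those words go to partition 0; 'c' (99), 's' (115) and 'w' (119) are odd, so clickhouse, spark and world go to partition 1. The two possible return values 0 and 1 line up with the setNumReduceTasks(2) call in the driver, so each partition is handled by one reduce task.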

5. Test data

hello world
clickhouse clickhouse
hadoop
spark
hello world
clickhouse clickhouse
hadoop
spark
hello world
clickhouse clickhouse
hadoop
spark
clickhouse clickhouse
hadoop
spark
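With the partitioner and two reduce tasks configured above, this input should produce two output files along the following lines (counts tallied from the data above, key and value separated by a tab):

part-r-00000 (even first-letter ASCII code):
hadoop	4
hello	3

part-r-00001 (odd first-letter ASCII code):
clickhouse	8
spark	4
world	3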

Command for submitting the jar on the cluster

hadoop jar xx.jar driver.WordCount arg1 arg2

arg1: input file or directory

arg2: output directory
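Once the job has finished, one way to inspect the results is, for example, hadoop fs -cat <output directory>/part-r-00000, substituting whatever output directory you passed as arg2.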
