MapReduce编程之WordCount

最新推荐文章于 2020-04-27 10:45:38 发布

田小雨

最新推荐文章于 2020-04-27 10:45:38 发布

阅读量903

点赞数

分类专栏：大数据生态圈 文章标签：MapReduce hadoop wordcount

本文链接：https://blog.csdn.net/sunlei1980/article/details/46473163

版权

大数据生态圈专栏收录该内容

36 篇文章 0 订阅

订阅专栏

 
        //mapreduce程序 
       
        import  
        java.io.IOException; 
       
        import  
        java.util.StringTokenizer; 
       
        import  
        org.apache.hadoop.conf.Configuration; 
       
        import  
        org.apache.hadoop.fs.Path; 
       
        import  
        org.apache.hadoop.io.IntWritable; 
       
        import  
        org.apache.hadoop.io.LongWritable; 
       
        import  
        org.apache.hadoop.io.Text; 
       
        import  
        org.apache.hadoop.mapreduce.Job; 
       
        import  
        org.apache.hadoop.mapreduce.Mapper; 
       
        import  
        org.apache.hadoop.mapreduce.Reducer; 
       
        import  
        org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 
       
        import  
        org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 
       
        public  
        class  
        WordCount { 
       
        /** 
       
        * TokenizerMapper 继续自 Mapper<LongWritable, Text, Text, IntWritable> 
       
        * 
       
        * [一个文件就一个map,两个文件就会有两个map] 
       
        * map[这里读入输入文件内容 以" \t\n\r\f" 进行分割，然后设置 word ==> one 的key/value对] 
       
        * 
       
        * @param Object  Input key Type: 
       
        * @param Text    Input value Type: 
       
        * @param Text    Output key Type: 
       
        * @param IntWritable Output value Type: 
       
        * 
       
        * Writable的主要特点是它使得Hadoop框架知道对一个Writable类型的对象怎样进行serialize以及deserialize. 
       
        * WritableComparable在Writable的基础上增加了compareT接口，使得Hadoop框架知道怎样对WritableComparable类型的对象进行排序。 
       
        * 
       
        * @ author liuqingjie 
       
        * 
       
        */ 
       
        public  
        static  
        class  
        TokenizerMapper 
       
        extends  
        Mapper<LongWritable, Text, Text, IntWritable>{ 
       
        private  
        final  
        static  
        IntWritable one =  
        new  
        IntWritable( 
        1 
        ); 
       
        private  
        Text word =  
        new  
        Text(); 
       
        public  
        void  
        map(LongWritable key, Text value, Context context)  
        throws  
        IOException, InterruptedException { 
       
        StringTokenizer itr =  
        new  
        StringTokenizer(value.toString()); 
       
        while  
        (itr.hasMoreTokens()) { 
       
        word.set(itr.nextToken()); 
       
        context.write(word, one); 
       
        } 
       
        } 
       
        } 
       
        /** 
       
        * IntSumReducer 继承自 Reducer<Text,IntWritable,Text,IntWritable> 
       
        * 
       
        * [不管几个Map,都只有一个Reduce,这是一个汇总] 
       
        * reduce[循环所有的map值,把word ==> one 的key/value对进行汇总] 
       
        * 
       
        * 这里的key为Mapper设置的word[每一个key/value都会有一次reduce] 
       
        * 
       
        * 当循环结束后，最后的确context就是最后的结果. 
       
        * 
       
        * @author liuqingjie 
       
        * 
       
        */ 
       
        public  
        static  
        class  
        IntSumReducer 
       
        extends  
        Reducer<Text,IntWritable,Text,IntWritable> { 
       
        private  
        IntWritable result =  
        new  
        IntWritable(); 
       
        public  
        void  
        reduce(Text key, Iterable<IntWritable> values, 
       
        Context context 
       
        )  
        throws  
        IOException, InterruptedException { 
       
        int  
        sum =  
        0 
        ; 
       
        for  
        (IntWritable val : values) { 
       
        sum += val.get(); 
       
        } 
       
        result.set(sum); 
       
        context.write(key, result); 
       
        } 
       
        } 
       
        public  
        static  
        void  
        main(String[] args)  
        throws  
        Exception { 
       
        Configuration conf =  
        new  
        Configuration(); 
       
        if  
        (args.length !=  
        2 
        ) { 
       
        System.err.println( 
        "请配置路径  " 
        ); 
       
        System.exit( 
        2 
        ); 
       
        } 
       
        Job job =  
        new  
        Job(conf,  
        "wordcount" 
        ); 
       
        job.setJarByClass(WordCount. 
        class 
        ); 
        //主类 
       
        job.setMapperClass(TokenizerMapper. 
        class 
        ); 
        //mapper 
       
        job.setReducerClass(IntSumReducer. 
        class 
        ); 
        //reducer 
       
        job.setMapOutputKeyClass(Text. 
        class 
        ); 
        //设置map输出数据的关键类 
       
        job.setMapOutputValueClass(IntWritable. 
        class 
        ); 
        //设置map输出值类 
       
        job.setOutputKeyClass(Text. 
        class 
        ); 
        //设置作业输出数据的关键类 
       
        job.setOutputValueClass(IntWritable. 
        class 
        ); 
        //设置作业输出值类 
       
        FileInputFormat.addInputPath(job,  
        new  
        Path(otherArgs[ 
        0 
        ])); 
        //文件输入 
       
        FileOutputFormat.setOutputPath(job,  
        new  
        Path(otherArgs[ 
        1 
        ])); 
        //文件输出 
       
        System.exit(job.waitForCompletion( 
        true 
        ) ?  
        0  
        :  
        1 
        ); 
        //等待完成退出. 
       
        } 
       
        }

编写过程分析：

（1）数据类型

整型：IntWritable，这是Hadoop对int的封装

字符串型：Text，这是Hadoop对String的封装

上下文对象：Context，它用来与MapReduce系统进行通信，如把map的结果传给reduce处理

（2）执行过程

分为两个阶段：map阶段和reduce阶段, 以key/value为输入输出，其中key、value的类型可以由程序员自定义。

map编写：

自定义一个类，继承于基类Mapper，该基类是一个泛型类，有4个形参类型：用来指定map函数的输入键、输入值、输出键、输出值，格式如下：public class Mapper<KEYIN,VALUEIN,KEYOUT,VALUEOUT>。

根据实际需要，重写map函数，函数类型由Mapper指定。每一对<key,value>调用一次map函数。

wordcount程序中，map方法中的value值存储的是文本文件中的一行，key值为该行的起始位置相对于文本文件起始位置的偏移量，在本程序中，key值未使用。StringTokenizer类用于将每一行按空白字符拆分为一个个的单词。

reduce编写：

自定义一个类，继承于基类Reducer，该基类是一个泛型类，有4个形参类型：用来指定reduce函数的输入键、输入值、输出键、输出值，格式为public class Reducer<KEYIN,VALUEIN,KEYOUT,VALUEOUT>，其中reduce的输入类型必须与map的输出类型一致。

根据实际需要，重写reduce方法，方法的类型由Reducer指定。每一个key调用一次reduce方法。

主函数编写：

在主函数中进行作业的配置，主要配置有：

 
        Job job =  
        new  
        Job(conf,  
        "word count" 
        ); 
       
        job.setJarByClass(WordCount. 
        class 
        ); 
        //主类 
       
        job.setMapperClass(TokenizerMapper. 
        class 
        ); 
        //mapper 
       
        job.setReducerClass(IntSumReducer. 
        class 
        ); 
        //reducer 
       
        job.setMapOutputKeyClass(Text. 
        class 
        ); 
        //设置map输出数据的关键类 
       
        job.setMapOutputValueClass(IntWritable. 
        class 
        ); 
        //设置map输出值类 
       
        job.setOutputKeyClass(Text. 
        class 
        ); 
        //设置作业输出数据的关键类 
       
        job.setOutputValueClass(IntWritable. 
        class 
        ); 
        //设置作业输出值类 
       
        FileInputFormat.addInputPath(job,  
        new  
        Path(otherArgs[ 
        0 
        ])); 
        //文件输入 
       
        FileOutputFormat.setOutputPath(job,  
        new  
        Path(otherArgs[ 
        1 
        ])); 
        //文件输出 
       
        System.exit(job.waitForCompletion( 
        true 
        ) ?  
        0  
        :  
        1 
        ); 
        //等待完成退出.