WordCount代码模板

最新推荐文章于 2024-05-10 10:39:28 发布

白墨Blake

最新推荐文章于 2024-05-10 10:39:28 发布

阅读量369

点赞数

分类专栏： Java学习

本文链接：https://blog.csdn.net/ilovehua521/article/details/84779515

版权

Java学习专栏收录该内容

60 篇文章 2 订阅

订阅专栏

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


import java.io.IOException;

/**
 * MapReduce word-count job: splits every input line on whitespace and writes
 * one {@code word <TAB> frequency} style record per distinct word.
 *
 * <p>Usage: {@code WordCount [inputPath [outputPath]]}. When an argument is
 * omitted the built-in default ({@code /word.txt} and {@code /word-count-2})
 * is used.
 */
public class WordCount {

    /** Input location used when no first command-line argument is supplied. */
    private static final String DEFAULT_INPUT_PATH = "/word.txt";

    /** Output location used when no second command-line argument is supplied. */
    private static final String DEFAULT_OUTPUT_PATH = "/word-count-2";

    /**
     * Mapper that turns each input line into {@code (word, 1)} pairs.
     *
     * <ul>
     *   <li>Input key: offset of the line in the file ({@link LongWritable})</li>
     *   <li>Input value: the content of one line ({@link Text})</li>
     *   <li>Output key: a single word ({@link Text})</li>
     *   <li>Output value: the constant 1 ({@link IntWritable})</li>
     * </ul>
     */
    private static class WordCountMapper extends Mapper<LongWritable,Text,Text,IntWritable>{

        // Reused output objects; the value is always 1 and never changes.
        private final Text outputKey = new Text();
        private final IntWritable outputValue = new IntWritable(1);

        /**
         * Called once per input record; emits {@code (word, 1)} for every
         * non-empty whitespace-separated token of the line.
         *
         * @param key     offset of the line (unused)
         * @param value   the line to tokenize
         * @param context sink for the emitted pairs
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // 1. Convert the Text to a String so it can be split.
            String line = value.toString();

            // 2. Split on runs of whitespace ("\\s+" = one or more whitespace characters).
            String[] words = line.split("\\s+");

            // 3. Emit every real word with the count 1.
            for (String word : words) {

                // split() yields an empty token for a blank line and for a line
                // that starts with whitespace; that is not a word, so skip it.
                if (word.isEmpty()) {
                    continue;
                }

                outputKey.set(word);
                context.write(outputKey, outputValue);
            }
        }
    }

    /**
     * Reducer that adds up the partial counts of one word.
     *
     * <ul>
     *   <li>Input key: a word ({@link Text})</li>
     *   <li>Input values: the counts emitted for that word ({@link IntWritable})</li>
     *   <li>Output key: the same word ({@link Text})</li>
     *   <li>Output value: the word's total frequency ({@link IntWritable})</li>
     * </ul>
     */
    private static class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

        // Reused output value object.
        private final IntWritable outputValue = new IntWritable();

        /**
         * Called once per distinct key with all values grouped under it,
         * e.g. {@code hello -> [1, 1, 1]}; writes {@code (word, total)}.
         *
         * @param key     the word
         * @param values  the counts collected for the word
         * @param context sink for the result
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if writing to the context is interrupted
         */
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {

            // Sum the values instead of counting the elements: the result is
            // identical while every value is 1, and stays correct if the
            // incoming values are already partial sums.
            int count = 0;
            for (IntWritable value : values) {
                count += value.get();
            }

            outputValue.set(count);
            context.write(key, outputValue);
        }
    }


    /**
     * Configures the job, submits it and waits for it to finish. The process
     * terminates with exit status 1 when the job fails or cannot be submitted.
     *
     * @param args optional {@code [inputPath [outputPath]]}; missing entries
     *             fall back to the built-in defaults
     */
    public static void main(String[] args) {

        // Optional command-line overrides for the input and output locations.
        String input = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;
        String output = (args != null && args.length > 1) ? args[1] : DEFAULT_OUTPUT_PATH;

        // 1. Cluster location: HA nameservice "ns1" with two configured namenodes.
        Configuration conf = new Configuration();

        conf.set("fs.defaultFS", "hdfs://ns1");
        conf.set("dfs.nameservices", "ns1");
        conf.set("dfs.ha.namenodes.ns1", "nn1,nn2");
        conf.set("dfs.namenode.rpc-address.ns1.nn1", "master:9000");
        conf.set("dfs.namenode.rpc-address.ns1.nn2", "slave1:9000");
        conf.set("dfs.client.failover.proxy.provider.ns1", "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");

        boolean succeeded = false;
        try {

            // 2. Create the job and give it a name and a main class.
            Job job = Job.getInstance(conf);
            job.setJobName("word-count");
            job.setJarByClass(WordCount.class);

            // Mapper and reducer implementations.
            job.setMapperClass(WordCountMapper.class);
            job.setReducerClass(WordCountReducer.class);

            // Key/value types produced by the mapper.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(IntWritable.class);

            // Key/value types produced by the reducer.
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            // Location of the data to process.
            Path inputPath = new Path(input);
            FileInputFormat.addInputPath(job, inputPath);

            // Location of the result. The output directory must not exist when
            // the job starts, otherwise the job reports an error, so any
            // previous result is removed first. The file system is resolved
            // from the path itself so a path with its own scheme is handled.
            Path outputPath = new Path(output);
            FileSystem fileSystem = outputPath.getFileSystem(conf);
            fileSystem.delete(outputPath, true);
            FileOutputFormat.setOutputPath(job, outputPath);

            // Submit the job and block until it finishes, printing progress.
            succeeded = job.waitForCompletion(true);

        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Preserve the interrupt status for whoever runs this thread.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }

        // Report failure to the caller (shell, scheduler) via the exit status;
        // on success main simply returns, as before.
        if (!succeeded) {
            System.exit(1);
        }
    }
}

白墨Blake

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
WordCount代码模板

import org.apache.hadoop.conf.Configuration;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hadoop.io.IntWritable;import org.apache.hadoop.io.LongWritable...
复制链接

扫一扫