MapReduce WordCount 统计排序

最新推荐文章于 2024-08-14 14:26:42 发布

xzh199308

最新推荐文章于 2024-08-14 14:26:42 发布

阅读量239

点赞数 3

本文链接：https://blog.csdn.net/qq_20101897/article/details/130567228

版权

大数据专栏收录该内容

5 篇文章 1 订阅

订阅专栏

统计代码：

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * First stage of a two-job MapReduce pipeline: sums the integer score recorded for each name,
 * then passes the result directory to {@link SecoreSort} so the totals are ordered by score.
 *
 * <p>Each input line is expected to hold a name followed by an integer score, separated by
 * whitespace. Tokens after the score are ignored.
 */
public class ScoreCount {

    /** Counter group under which unparseable input lines are reported. */
    private static final String COUNTER_GROUP = "ScoreCount";

    /** Counter name for input lines that lack a name/score pair or carry a non-integer score. */
    private static final String MALFORMED_LINES = "MALFORMED_LINES";

    /**
     * Emits one {@code (name, score)} pair per well-formed input line.
     */
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {

        // Reused for every record to avoid per-record allocation. Kept per instance (not static)
        // so that mapper instances never share mutable state.
        private final IntWritable score = new IntWritable(0);
        private final Text name = new Text();

        /**
         * Parses the value into name/score pairs and writes each pair to the context.
         *
         * <p>The value is first split on {@code '\n'}, so a value holding several lines is handled
         * line by line. Lines that cannot be parsed are skipped and counted under
         * {@code ScoreCount/MALFORMED_LINES} instead of being silently dropped.
         *
         * @param key     input key; not used
         * @param value   text containing {@code <name> <score>} lines
         * @param context sink for the emitted pairs and the malformed-line counter
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer lines = new StringTokenizer(value.toString(), "\n");
            while (lines.hasMoreTokens()) {
                StringTokenizer fields = new StringTokenizer(lines.nextToken());
                if (fields.countTokens() < 2) {
                    context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
                    continue;
                }

                name.set(fields.nextToken()); // name
                try {
                    score.set(Integer.parseInt(fields.nextToken().trim())); // score
                } catch (NumberFormatException e) {
                    // Only the parse failure is tolerated here; write failures and interruption
                    // must propagate so the framework can fail or retry the task.
                    context.getCounter(COUNTER_GROUP, MALFORMED_LINES).increment(1);
                    continue;
                }
                context.write(name, score);
            }
        }
    }

    /**
     * Adds up all scores seen for a name. Registered both as the combiner and as the reducer in
     * {@link #main(String[])}, which is safe because integer addition is associative.
     */
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        /**
         * Writes {@code (key, sum of values)}.
         *
         * @param key     the name
         * @param values  the scores (or partial sums) recorded for that name
         * @param context sink for the total
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            // Accumulate the scores.
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    /**
     * Orders {@link IntWritable} keys in descending order by negating the natural comparison.
     *
     * <p>NOTE(review): this comparator is private and not referenced anywhere in this class, so it
     * currently has no effect on either job.
     */
    private static class SortComparator extends IntWritable.Comparator {

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }

        @Override
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
            return -super.compare(b1, s1, l1, b2, s2, l2);
        }
    }


    /**
     * Runs the score-summing job and, if it succeeds, the follow-up sort job.
     *
     * <p>The sort job reads the summing job's output directory and writes to a sibling directory
     * whose name is the output path with {@code "2"} appended.
     *
     * @param args generic Hadoop options followed by {@code <in> [<in>...] <out>}
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        // Strip generic Hadoop options and keep the remaining path arguments.
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // Validate arguments.
        if (otherArgs.length < 2) {
            System.err.println("Usage: Scorecount <in> [<in>...] <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "Score count");
        // Class used to locate the job jar.
        job.setJarByClass(ScoreCount.class);
        // Mapper class.
        job.setMapperClass(TokenizerMapper.class);
        // Combiner class (optional).
        job.setCombinerClass(IntSumReducer.class);
        // Reducer class.
        job.setReducerClass(IntSumReducer.class);
        // Key and value types produced by the reducer.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Every argument except the last one is an input path.
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }

        // The last argument is the output directory.
        String outputDir = otherArgs[otherArgs.length - 1];
        FileOutputFormat.setOutputPath(job, new Path(outputDir));

        if (!job.waitForCompletion(true)) {
            // Report the failure to the calling shell instead of exiting with status 0.
            System.exit(1);
        }

        // Run the sorting job over the summed scores.
        SecoreSort.run(new String[]{outputDir, outputDir + "2"});
    }
}

排序代码：


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import java.io.IOException;
import java.util.StringTokenizer;

/**
 * MapReduce job that orders {@code <word> <count>} records by their integer count.
 *
 * <p>The mapper swaps each record to {@code (count, word)} so that the shuffle sorts by count; the
 * reducer swaps the pair back to {@code (word, count)} when writing the output.
 *
 * @author xiongzhenhai
 * @since 2023/5/8
 */
public class SecoreSort {

    /**
     * Turns whitespace-separated {@code <word> <count>} pairs into {@code (count, word)} records.
     */
    public static class SortIntValueMapper extends Mapper<LongWritable, Text, IntWritable, Text> {

        // Reused for every record; kept per instance (not static) so mapper instances never share
        // mutable state.
        private final IntWritable wordCount = new IntWritable(1);
        private final Text word = new Text();

        /**
         * Emits one {@code (count, word)} record for every {@code <word> <count>} pair in the value.
         *
         * @param key     input key; not used
         * @param value   text holding one or more {@code <word> <count>} pairs
         * @param context sink for the swapped records
         * @throws IOException          if a pair is incomplete, its count is not an integer, or
         *                              writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String label = tokenizer.nextToken().trim();
                if (!tokenizer.hasMoreTokens()) {
                    // Fail with the offending record instead of a bare NoSuchElementException.
                    throw new IOException("Malformed record, no count after '" + label + "': " + value);
                }

                String rawCount = tokenizer.nextToken().trim();
                try {
                    wordCount.set(Integer.parseInt(rawCount));
                } catch (NumberFormatException e) {
                    throw new IOException(
                            "Malformed record, count '" + rawCount + "' is not an integer: " + value, e);
                }

                word.set(label);
                // Swap <k,v> so the framework sorts by count.
                context.write(wordCount, word);
            }
        }
    }

    /**
     * Swaps {@code (count, words)} groups back into {@code (word, count)} records.
     */
    public static class SortIntValueReduce extends Reducer<IntWritable, Text, Text, IntWritable> {

        /**
         * Writes one {@code (word, count)} record per value in the group.
         *
         * @param key     the count shared by every word in {@code values}
         * @param values  the words that have this count
         * @param context sink for the restored records
         * @throws IOException          if writing to the context fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text val : values) {
                // Swap <k,v> back; the value is written before the iterator advances, so no copy
                // of the Text is needed.
                context.write(val, key);
            }
        }
    }

    /**
     * Configures and runs the sort job, blocking until it finishes.
     *
     * @param args generic Hadoop options followed by exactly {@code <in> <out>}
     * @throws IllegalStateException if the job runs but does not complete successfully
     * @throws Exception             if job setup or submission fails
     */
    public static void run(String[] args) throws Exception {
        // Sorting idea: swap the k/v produced by the previous step, then swap back in the reducer.
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordsort <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, "word sort");
        job.setJarByClass(SecoreSort.class);

        job.setMapperClass(SortIntValueMapper.class);
        job.setReducerClass(SortIntValueReduce.class);

        // Map output and final output have swapped types, so each is declared separately.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Keys are only ordered within a single reducer; one reducer yields one globally
        // ordered output file regardless of the cluster's default reducer count.
        job.setNumReduceTasks(1);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

        if (!job.waitForCompletion(true)) {
            throw new IllegalStateException("Sort job failed for input " + otherArgs[0]);
        }
    }

}