使用MapReduce对豆瓣的数据评分进行排序，最终只要：电影名,评分两个字段。（温馨提示：数据需要去重）

本文链接：https://blog.csdn.net/m0_46299185/article/details/121319545

该博客介绍了如何使用Java的MapReduce实现对豆瓣数据的评分进行排序，同时通过覆盖Map中相同key的方式进行数据去重。在Map阶段，通过正则表达式筛选出评分并创建自定义Stu对象。在Reduce阶段，输出排序后的电影名和评分。完整代码包括Mapper和Reducer类，以及实现了WritableComparable接口的Stu类。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

使用MapReduce对豆瓣的数据评分进行排序，最终只要：电影名,评分两个字段。（温馨提示：数据需要去重）

一、

首先讲一下Java中的compareTo方法：

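使用 compareTo 时，返回值只有符号有意义：负数、0、正数（习惯上写成 -1、0、1）。
返回正数（如 1）表示 this 排在参数 o 之后（可以理解为两个数交换顺序），返回负数（如 -1）表示 this 排在 o 之前（不交换顺序）。
0 表示两者相同，不交换顺序；但两个元素会被当作同一个元素，发生覆盖进而造成数据丢失，是个坑。而在 MapReduce 中，排序和分组比较的是 map 输出的 key，compareTo 返回 0 的 key 会被归为同一组，只输出一次。可以利用这个坑顺利去重。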

代码演示：（排序以及去重部分）

 if (this.score >  o.score){    // primary sort key: score
                return 1;                      // positive result: this sorts after o, i.e. ascending by score
            }else if(this.score <  o.score){
                return -1;                    // negative result: this sorts before o, also ascending by score
            }else if (this.name.compareTo(o.name)==0){   // scores tie: fall back to comparing names
                return 0;                     // zero: same score and same name, so the two keys compare as equal; the surrounding text relies on this to drop duplicate records
            }else if(this.name.compareTo(o.name)<0){
                return 1;                   // the name that compares lower sorts later, i.e. descending by name within one score
            }

二、

另外在筛选字段时，需要简单用到正则表达式，

// NOTE(review): "\\.+" accepts one or more dots, so a value such as "8..5" passes
// this check and would then make Double.parseDouble throw NumberFormatException.
boolean result=strings[1].matches("\\d+\\.+\\d+");// keep only rows whose second field looks like a decimal score
            if (result == true) {
                // field 0 is used as the name, field 1 as the numeric score
                Stu stu = new Stu(strings[0], Double.parseDouble(strings[1]));
                // the Stu object is the map output key; no value is needed
                context.write(stu, NullWritable.get());
            }

三、

全文代码需要用到两个.java(更改文件名，代码可直接使用)：

/**
 * @Time : 2021/11/10 20:31
 * @Author : Carapace
 * @File : ComparableCode.java
 * Software: IntelliJ IDEA
 */

package com.Comparable;


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * MapReduce job that sorts tab-separated movie records by rating and removes duplicates.
 *
 * <p>The mapper turns every valid input line into a {@link Stu} key (field 0 = name,
 * field 1 = rating). The framework sorts map output keys with {@link Stu#compareTo(Stu)},
 * and the reducer writes each distinct key exactly once, so records whose name and rating
 * are both identical collapse into a single output line.
 *
 * <p>Created 2021/11/10.
 *
 * @author houda
 */
public class ComparableCode {

    /** Input path used when no command-line argument is supplied. */
    private static final String DEFAULT_INPUT_PATH = "./datas/douban.txt";

    /** Output directory used when fewer than two command-line arguments are supplied. */
    private static final String DEFAULT_OUTPUT_PATH = "./outStudentClassSortScore";

    /**
     * Parses input lines and emits a {@link Stu} key for every line with a decimal rating.
     * The map output key is what the framework sorts on; no value is needed.
     */
    public static class MapComparable extends Mapper<LongWritable, Text, Stu, NullWritable> {

        /**
         * A rating is digits, exactly one dot, digits (e.g. {@code 9.7}). Compiled once
         * instead of per record. A single dot is required so that every accepted value
         * is guaranteed to be parseable by {@link Double#parseDouble(String)}.
         */
        private static final java.util.regex.Pattern SCORE_PATTERN =
                java.util.regex.Pattern.compile("\\d+\\.\\d+");

        /**
         * Emits one key per well-formed line; header rows and malformed lines are skipped.
         *
         * @param key     input key supplied by the framework (unused)
         * @param value   one line of input, fields separated by a tab
         * @param context sink for the map output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Stu, NullWritable>.Context context) throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            // Blank lines or lines without a second column cannot carry a rating.
            if (fields.length < 2) {
                return;
            }
            String rawScore = fields[1];
            // Filters out the header row and any non-numeric rating.
            if (!SCORE_PATTERN.matcher(rawScore).matches()) {
                return;
            }
            context.write(new Stu(fields[0], Double.parseDouble(rawScore)), NullWritable.get());
        }
    }

    /**
     * Writes every distinct key once. Keys that compare equal arrive in a single
     * {@code reduce} call, so ignoring {@code values} is what removes the duplicates.
     */
    public static class ReduceComparable extends Reducer<Stu, NullWritable,Text,NullWritable> {

        /**
         * @param key     one distinct (name, rating) pair
         * @param values  one placeholder per duplicate of {@code key}; intentionally ignored
         * @param context sink for the job output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted while writing
         */
        @Override
        protected void reduce(Stu key, Iterable<NullWritable> values, Reducer<Stu, NullWritable, Text, NullWritable>.Context context) throws IOException, InterruptedException {
            context.write(new Text(key.toString()), NullWritable.get());
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args optional: {@code args[0]} input path, {@code args[1]} output directory;
     *             the built-in defaults are used for whichever is missing
     * @throws IOException            on file-system or job-submission failure
     * @throws InterruptedException   if interrupted while waiting for the job
     * @throws ClassNotFoundException if a job class cannot be loaded
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(ComparableCode.class);

        job.setMapperClass(MapComparable.class);
        job.setMapOutputKeyClass(Stu.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setReducerClass(ReduceComparable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Keys are only sorted within one reducer, so a globally sorted result
        // needs exactly one reduce task.
        job.setNumReduceTasks(1);

        Path inputPath = new Path(args.length > 0 ? args[0] : DEFAULT_INPUT_PATH);
        Path outputPath = new Path(args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // Remove a stale output directory so the job can be re-run.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // Report a failed job through the process exit status instead of ignoring it.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}

/**
 * @Time : 2021/11/10 20:33
 * @Author : Carapace
 * @File : Stu.java
 * Software: IntelliJ IDEA
 */

package com.Comparable;


import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Map output key holding a name and a numeric score.
 *
 * <p>Ordering is ascending by score and, for equal scores, descending by name.
 * Two instances with the same score and the same name compare as equal, are
 * {@link #equals(Object) equal}, and share a {@link #hashCode() hash code}, so
 * they are treated as one key regardless of which JVM created them.
 *
 * <p>Created 2021/11/10.
 *
 * @author houda
 */
public class Stu implements WritableComparable<Stu> {

    private String name;
    private double score;

    /** No-argument constructor; fields are expected to be filled by {@link #readFields(DataInput)}. */
    public Stu() {
    }

    /**
     * @param name  the record's name
     * @param score the record's score
     */
    public Stu(String name, double score) {
        this.name = name;
        this.score = score;
    }

    /**
     * Orders by score ascending, then by name descending.
     *
     * <p>{@link Double#compare(double, double)} is used instead of {@code <}/{@code >}
     * so that the ordering stays total even for NaN, which the raw operators would
     * report as "equal" to every other score.
     *
     * @param o the key to compare against
     * @return negative, zero or positive as this key sorts before, equal to or after {@code o}
     */
    @Override
    public int compareTo(Stu o) {
        int byScore = Double.compare(this.score, o.score);
        if (byScore != 0) {
            return byScore;
        }
        // Operands are reversed on purpose: names sort in descending order within one score.
        return o.name.compareTo(this.name);
    }

    /**
     * Two keys are equal when both score and name match, consistent with
     * {@link #compareTo(Stu)} returning zero.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Stu)) {
            return false;
        }
        Stu other = (Stu) obj;
        boolean sameName = (name == null) ? other.name == null : name.equals(other.name);
        return sameName && Double.compare(score, other.score) == 0;
    }

    /**
     * Content-based hash so that equal keys hash identically in every JVM; the
     * identity hash inherited from {@code Object} would differ between instances.
     */
    @Override
    public int hashCode() {
        long scoreBits = Double.doubleToLongBits(score);
        int result = (name == null) ? 0 : name.hashCode();
        return 31 * result + (int) (scoreBits ^ (scoreBits >>> 32));
    }

    /**
     * Serializes the name followed by the score.
     *
     * @param out destination stream
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeDouble(score);
    }

    /**
     * Restores the fields in the same order {@link #write(DataOutput)} emitted them.
     *
     * @param in source stream
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.score = in.readDouble();
    }

    /** @return the name */
    public String getName() {
        return name;
    }

    /** @param name the new name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the score */
    public double getScore() {
        return score;
    }

    /** @param score the new score */
    public void setScore(double score) {
        this.score = score;
    }

    /**
     * @return the name and the score separated by a single tab, i.e. exactly two columns
     */
    @Override
    public String toString() {
        return name + "\t" + score;
    }
}