Spark分组取TopN

1、对文本文件中的数字,获取最大的前三个。

代码实例:

package com.netcloud.spark.sparkcore.projectpractice;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.List;

/**
 * Java version: prints the three largest numbers found in a text file.
 *
 * <p>Every non-blank line of {@code data/sparkcore/top3.txt} is expected to hold
 * one integer. Lines are trimmed before parsing and blank lines are skipped; a
 * line that is still not a valid integer makes the job fail with a
 * {@link NumberFormatException}.
 *
 * @author yangshaojun
 * #date  2019/3/15 16:38
 * @version 1.0
 */
public class Demo_002_TopThree {
    /**
     * Runs the job on a local master and prints the three largest values, one per line.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        // The application name matches this class (it was previously copied from another demo).
        conf.setMaster("local").setAppName("Demo_002_TopThree");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // try/finally guarantees the context is stopped even when the job throws.
        try {
            JavaRDD<String> lineRDD = sc.textFile("data/sparkcore/top3.txt");

            // Strip surrounding whitespace so values such as "12 " still parse.
            JavaRDD<String> trimmedRDD = lineRDD.map(new Function<String, String>() {
                @Override
                public String call(String line) throws Exception {
                    return line.trim();
                }
            });

            // Drop blank lines; they carry no number and would fail Integer.valueOf.
            JavaRDD<String> numberRDD = trimmedRDD.filter(new Function<String, Boolean>() {
                @Override
                public Boolean call(String line) throws Exception {
                    return !line.isEmpty();
                }
            });

            // Key every line by its numeric value so it can be sorted numerically
            // rather than lexicographically.
            JavaPairRDD<Integer, String> integerRDD = numberRDD.mapToPair(new PairFunction<String, Integer, String>() {
                @Override
                public Tuple2<Integer, String> call(String s) throws Exception {
                    return new Tuple2<Integer, String>(Integer.valueOf(s), s);
                }
            });

            // Sort in descending order of the numeric key.
            JavaPairRDD<Integer, String> sortNumberRDD = integerRDD.sortByKey(false);

            // Drop the key again, keeping only the original text of each number.
            JavaRDD<String> retRDD = sortNumberRDD.map(new Function<Tuple2<Integer, String>, String>() {
                @Override
                public String call(Tuple2<Integer, String> v1) throws Exception {
                    return v1._2;
                }
            });

            // The first three elements of the sorted RDD are the three largest values.
            List<String> beforeThree = retRDD.take(3);
            for (String ret : beforeThree) {
                System.out.println(ret);
            }
        } finally {
            sc.stop();
        }
    }
}


package com.netcloud.bigdata.spark_core.basiclearning.projectpractice

import org.apache.spark.{SparkConf, SparkContext}

/** Scala version: prints the three largest numbers found in a text file.
  *
  * Every non-blank line of `data/sparkcore/top3.txt` is expected to hold one
  * integer. Lines are trimmed before parsing and blank lines are skipped; a line
  * that is still not a valid integer makes the job fail with a
  * `NumberFormatException`.
  *
  * @author yangshaojun
  * #date  2019/3/15 16:40
  * @version 1.0
  */
object Demo_002_TopThree {
  /** Runs the job on a local master and prints the three largest values, one per line.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Demo_002_TopThree")
    val sc = new SparkContext(conf)
    // try/finally guarantees the context is stopped even when the job throws.
    try {
      val topThree = sc.textFile("data/sparkcore/top3.txt")
        .map(_.trim)
        .filter(_.nonEmpty)                  // blank lines would fail toInt
        .map(num => (num.toInt, num))        // key by numeric value for a numeric sort
        .sortByKey(ascending = false)        // largest first
        .map { case (_, text) => text }      // keep only the original text
        .take(3)
      topThree.foreach(println)
    } finally {
      sc.stop()
    }
  }
}

2、对每个班级内的学生成绩,取出前三名。(分组TopN)

代码实例:

package com.netcloud.spark.sparkcore.projectpractice;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.Iterator;

/**
 * Prints the top three scores of every class (grouped TopN).
 *
 * <p>Each line of {@code data/sparkcore/score.txt} is expected to have the form
 * {@code className,score}. Scores are grouped by class name, and for every class
 * up to three highest scores are printed as {@code className:score}, highest
 * first. A class with fewer than three scores prints only the scores it has.
 *
 * @author yangshaojun
 * #date  2019/3/15 17:05
 * @version 1.0
 */
public class Demo_003_GroupTopThree {
    /** Number of top scores kept per class. */
    private static final int TOP_N = 3;

    /**
     * Runs the job on a local master and prints the top scores of each class.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf();
        conf.setMaster("local").setAppName("Demo_003_GroupTopThree");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // try/finally guarantees the context is stopped even when the job throws.
        try {
            JavaRDD<String> lineRDD = sc.textFile("data/sparkcore/score.txt");

            // Parse "className,score" into a (className, score) pair.
            JavaPairRDD<String, Integer> kvRDD = lineRDD.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) throws Exception {
                    // Split once and reuse both fields.
                    String[] fields = s.split(",");
                    return new Tuple2<String, Integer>(fields[0], Integer.valueOf(fields[1].trim()));
                }
            });

            JavaPairRDD<String, Iterable<Integer>> groupPairRDD = kvRDD.groupByKey();

            // Reduce every class's scores to its highest TOP_N values.
            JavaPairRDD<String, Iterable<Integer>> top3RDD = groupPairRDD.mapToPair(new PairFunction<Tuple2<String, Iterable<Integer>>, String, Iterable<Integer>>() {
                @Override
                public Tuple2<String, Iterable<Integer>> call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                    return new Tuple2<String, Iterable<Integer>>(t._1, topScores(t._2, TOP_N));
                }
            });

            top3RDD.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
                @Override
                public void call(Tuple2<String, Iterable<Integer>> t) throws Exception {
                    String className = t._1;
                    for (Integer score : t._2) {
                        System.out.println(className + ":" + score);
                    }
                }
            });
        } finally {
            sc.stop();
        }
    }

    /**
     * Returns the {@code limit} largest scores in descending order.
     *
     * <p>Keeps a small descending-sorted buffer and inserts every score at its
     * proper position, shifting all lower entries down by one slot. The previous
     * implementation shifted only a single entry, so inserting a new maximum into
     * a full buffer overwrote the second place instead of pushing it to third.
     *
     * @param scores all scores of one class
     * @param limit  maximum number of scores to return
     * @return at most {@code limit} scores, highest first; contains no null padding
     */
    private static Iterable<Integer> topScores(Iterable<Integer> scores, int limit) {
        Integer[] best = new Integer[limit];
        int filled = 0;
        for (Integer score : scores) {
            // Walk left past every kept score that is strictly smaller than this one.
            int pos = filled;
            while (pos > 0 && score > best[pos - 1]) {
                pos--;
            }
            if (pos >= limit) {
                // Not larger than any kept score and the buffer is already full.
                continue;
            }
            // Shift lower scores down one slot; the last one falls off when the buffer is full.
            int last = Math.min(filled, limit - 1);
            for (int j = last; j > pos; j--) {
                best[j] = best[j - 1];
            }
            best[pos] = score;
            if (filled < limit) {
                filled++;
            }
        }
        // Trim unused slots so callers never see null entries.
        return Arrays.asList(Arrays.copyOf(best, filled));
    }
}

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值