Spark分组取topN与二次排序
分组取topN
将排序数据全部加载至内存
测试数据
class1 100
class2 85
class3 70
class1 102
class2 65
class1 45
class2 85
class3 70
class1 16
class2 88
class1 95
class2 37
class3 98
class1 99
class2 23
方法一:将所有待排序的数据加载至内存,然后进行排序,取出前N个数据
// Approach one: materialise every score of a class in memory, sort, keep the first three.
// Each input line has the form "<class> <score>", separated by a single space.
val rdd1: RDD[String] = sc.textFile("xx\\class.txt")
rdd1
  .map { line =>
    // Split once and key the numeric score by its class name.
    val fields = line.split(" ")
    (fields(0), fields(1).toInt)
  }
  .groupByKey()
  .map { case (className, scores) =>
    // The whole group is copied into a List before sorting in descending order.
    val top3 = scores.toList.sorted(Ordering.Int.reverse).take(3)
    (className, Map("前三" -> top3))
  }
  .foreach(println)
方法二:对数据逐条过滤,将前三条选出
// Approach two: walk the scores of each class one by one and keep only the best three.
// Each input line has the form "<class> <score>", separated by a single space.
JavaRDD<String> lines = sc.textFile("xx\\class.txt");
// Pair each score (still a string) with its class name; every line is split only once.
JavaPairRDD<String, String> rdd = lines.mapToPair(line -> {
    String[] fields = line.split(" ");
    return new Tuple2<>(fields[0], fields[1]);
});
rdd.groupByKey().foreach(k -> {
    String className = k._1;
    Iterator<String> iterator = k._2.iterator();
    // Best scores seen so far, in descending order; null marks a slot that is still unused.
    Integer[] top3 = new Integer[3];
    // Scan the group once instead of building and sorting a full copy of its scores.
    while (iterator.hasNext()) {
        Integer score = Integer.valueOf(iterator.next());
        for (int i = 0; i < top3.length; i++) {
            if (top3[i] == null) {
                // Free slot: everything before it is >= score, so score belongs here.
                top3[i] = score;
                break;
            } else if (score > top3[i]) {
                // Shift the lower-ranked entries down one slot (dropping the last) to make room.
                for (int j = top3.length - 1; j > i; j--) {
                    top3[j] = top3[j - 1];
                }
                top3[i] = score;
                break;
            }
        }
    }
    System.out.println("Class Name: " + className);
    // Skip unused slots so a class with fewer than three scores does not print "null".
    Arrays.stream(top3).filter(s -> s != null).forEach(System.out::println);
});
测试数据
1,111,68,69,90,1班,经济系
2,112,73,80,96,1班,经济系
3,113,90,74,75,1班,经济系
4,114,89,94,93,1班,经济系
5,115,99,93,89,1班,经济系
6,121,96,74,79,2班,经济系
7,122,89,86,85,2班,经济系
8,123,70,78,61,2班,经济系
9,124,76,70,76,2班,经济系
10,211,89,93,60,1班,外语系
11,212,76,83,75,1班,外语系
12,213,71,94,90,1班,外语系
13,214,94,94,66,1班,外语系
14,215,84,82,73,1班,外语系
15,216,85,74,93,1班,外语系
16,221,77,99,61,2班,外语系
17,222,80,78,96,2班,外语系
18,223,79,74,96,2班,外语系
19,224,75,80,78,2班,外语系
20,225,82,85,63,2班,外语系
方式一:全部加载至内存
// Approach one: load each (department, class) group fully into memory and sort it per subject.
val studentsScore: RDD[String] = sc.textFile("xx\\top2.txt")

// Parse each CSV line into
// (studentId, chineseScore, mathScore, englishScore, class, department); column 0 is dropped.
val groups: RDD[(String, Int, Int, Int, String, String)] = studentsScore.map { scoreInfo =>
  val info = scoreInfo.split(",")
  (info(1), info(2).toInt, info(3).toInt, info(4).toInt, info(5), info(6))
}

/**
 * Top-K per group for several subjects at once.
 * Records are grouped by (department, class); within every group the three
 * highest scores of each subject are reported together with the student id.
 */
val result: RDD[(String, String, Map[String, List[String]])] =
  groups.groupBy(item => (item._6, item._5)).map { case ((departmentId, classId), members) =>
    // Materialise the group once; every subject ranking below reuses this list.
    val students = members.toList

    // Rank the group by the selected score (descending) and render the best three entries.
    def topThree(scoreOf: ((String, Int, Int, Int, String, String)) => Int): List[String] =
      students
        .sortBy(scoreOf)(Ordering.Int.reverse)
        .take(3)
        .map(student => scoreOf(student) + " 分: 学号" + student._1)

    val languageTopK = topThree(_._2) // Chinese
    val mathTopK = topThree(_._3)
    val englishTopK = topThree(_._4)
    (departmentId, classId, Map("语文前三" -> languageTopK, "数学前三" -> mathTopK, "英语前三" -> englishTopK))
  }
result.foreach(println)
方式二:逐条遍历,只维护前三名(示例仅统计语文成绩)
// Approach two: group by (department, class) and pick the top three Chinese scores in one pass.
JavaRDD<String> rdd = sc.textFile("xx\\top2.txt");
// Parse each CSV line into
// (studentId, chineseScore, mathScore, englishScore, class, department); column 0 is dropped.
JavaRDD<Tuple6<String, Integer, Integer, Integer, String, String>> rdd1 = rdd.map(item -> {
    String[] words = item.split(",");
    return new Tuple6<>(words[1], Integer.valueOf(words[2]), Integer.valueOf(words[3]), Integer.valueOf(words[4]), words[5], words[6]);
});
// Key every record by (department, class).
JavaPairRDD<Tuple2<String, String>, Iterable<Tuple6<String, Integer, Integer, Integer, String, String>>> rdd2 =
        rdd1.groupBy(item -> new Tuple2<>(item._6(), item._5()));
rdd2.foreach(item -> {
    Iterator<Tuple6<String, Integer, Integer, Integer, String, String>> iterator = item._2.iterator();
    // Best Chinese scores seen so far, in descending order; null marks a slot that is still unused.
    Integer[] languageTop3 = new Integer[3];
    // Scan the group once instead of building and sorting a full copy of it.
    while (iterator.hasNext()) {
        Tuple6<String, Integer, Integer, Integer, String, String> tuple6 = iterator.next();
        Integer score = tuple6._2();
        for (int i = 0; i < languageTop3.length; i++) {
            if (languageTop3[i] == null) {
                // Free slot: everything before it is >= score, so score belongs here.
                languageTop3[i] = score;
                break;
            } else if (score > languageTop3[i]) {
                // Shift the lower-ranked entries down one slot (dropping the last) to make room.
                for (int j = languageTop3.length - 1; j > i; j--) {
                    languageTop3[j] = languageTop3[j - 1];
                }
                languageTop3[i] = score;
                break;
            }
        }
    }
    System.out.println("=====" + item._1._1() + ", " + item._1._2() + "=====");
    for (int i = 0; i < languageTop3.length; i++) {
        // Skip unused slots so a group with fewer than three students does not print "null".
        if (languageTop3[i] != null) {
            System.out.print(languageTop3[i] + ", ");
        }
    }
    System.out.println();
});
二次排序
可以自定义一个类实现Ordered,相当于java中实现Comparable接口
/**
 * Composite sort key for secondary (multi-field) sorting.
 *
 * The fields in `arr` are compared position by position as integers; the first
 * position at which the two keys differ decides the ordering.
 *
 * @param arr the sort fields, most significant first; every field that gets
 *            compared must be parseable by `String.toInt`
 */
class SeveralSortKey(val arr: Array[String]) extends Ordered[SeveralSortKey] with Serializable {

  /**
   * Lexicographic comparison of the numeric fields of the two keys.
   *
   * @param that the key to compare against
   * @return a negative value if this key sorts first, a positive value if `that`
   *         sorts first, and 0 if all fields are equal; when one key is a strict
   *         prefix of the other, the shorter key sorts first
   * @throws NumberFormatException if a compared field is not a valid integer
   */
  override def compare(that: SeveralSortKey): Int = {
    // The iterator chain is lazy, so fields after the first difference are never parsed.
    val firstDifference = arr.iterator
      .zip(that.arr.iterator)
      // Integer.compare instead of subtraction: a - b can overflow and flip the sign.
      .map { case (mine, theirs) => Integer.compare(mine.toInt, theirs.toInt) }
      .find(_ != 0)

    // All shared positions are equal: identical keys compare as 0.
    firstDifference.getOrElse(Integer.compare(arr.length, that.arr.length))
  }
}