Requirements

Given an input file data1.txt in which each line holds a class name and a score separated by a space, output the three highest scores for each class.
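
For reference, here is one data1.txt consistent with the grouped values shown in the code comments below (a reconstruction; the line order inside the original file is unknown):

class1 100
class1 87
class1 76
class1 95
class1 74
class2 56
class2 88
class2 87
class2 99
class2 77
class3 41
class3 65
class3 75
class3 100
class3 34
class3 14
class4 95
class4 15
class4 45
class4 77
class4 88
class4 99
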
Code
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object TopN {
  def main(args: Array[String]): Unit = {
    // Create the SparkConf object
    val config: SparkConf = new SparkConf().setMaster("local[*]").setAppName("TopN")
    // Create the Spark context
    val sc = new SparkContext(config)
    // Read the file, one element per line
    val line: RDD[String] = sc.textFile("data1.txt")
    // Split each line into a (class, score) pair
    val mapRDD: RDD[(String, String)] = line.map(x => {
      val words: Array[String] = x.split(" ")
      (words(0), words(1))
    })
    // val value: RDD[(String, String)] = mapRDD.sortByKey(true, 1)
    // Group the scores by class
    val groupRDD: RDD[(String, Iterable[String])] = mapRDD.groupByKey()
    // groupRDD.foreach(println(_))
    /*
    (class3,CompactBuffer(41, 65, 75, 100, 34, 14))
    (class1,CompactBuffer(100, 87, 76, 95, 74))
    (class4,CompactBuffer(95, 15, 45, 77, 88, 99))
    (class2,CompactBuffer(56, 88, 87, 99, 77))
    */
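    // CompactBuffer is the array-backed Iterable that groupByKey builds per key,
    // so all scores of a class sit in one buffer in memory before we sort them.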
    val top3: RDD[(String, List[Int])] = groupRDD.map(x => {
      // Materialize the grouped scores as a List
      val list: List[String] = x._2.toList
      // Convert each String to Int, sort descending, and take the top 3
      val scores: List[Int] = list.map(_.toInt).sorted.reverse.take(3)
      (x._1, scores)
    })
    top3.foreach(println(_))

    sc.stop()
  }
}
Output

(The order of the four lines can differ between runs, because foreach prints from parallel tasks.)
(class3,List(100, 75, 65))
(class4,List(99, 95, 88))
(class1,List(100, 95, 87))
(class2,List(99, 88, 87))
Source code
Top N.scala
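
As a side note: groupByKey shuffles every score of a class before the top 3 are picked. A minimal alternative sketch (not part of the original source; it reuses mapRDD from the code above) uses aggregateByKey so that each partition keeps at most three candidate scores per class:

val top3Agg: RDD[(String, List[Int])] = mapRDD
  .mapValues(_.toInt)
  .aggregateByKey(List.empty[Int])(
    // within one partition: insert the score, keep only the 3 largest
    (acc, score) => (score :: acc).sorted.reverse.take(3),
    // when merging partitions: combine two candidate lists, keep the 3 largest
    (a, b) => (a ++ b).sorted.reverse.take(3)
  )
top3Agg.foreach(println(_))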