package interview

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Secondary sort: order records by the first column ascending and,
 * within each key, by the second column descending.
 */
object TwoSort {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getCanonicalName)
      // For a global sort, the reduce-side parallelism must be 1
      .set("spark.default.parallelism", "1")
      .set("spark.sql.shuffle.partitions", "1")
    val sc = new SparkContext(conf)

    val lines: RDD[String] = sc.textFile("data/input/twosorts.txt")
    val twoSortedRDD: RDD[(Int, Int)] = lines
      .map { line =>
        val fields: Array[String] = line.split(" ")
        (fields(0).toInt, fields(1).toInt)
      }
      // Note: group first, then sort by key
      .groupByKey()
      .sortByKey(ascending = true)
      // Sort each key's values in descending order
      .map(kv => (kv._1, kv._2.toList.sortWith(_ > _)))
      // Flatten back to (key, value) pairs
      .flatMap(kv => kv._2.map(v => kv._1 -> v))

    twoSortedRDD.foreach(println)
    sc.stop()
  }
}
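The groupByKey approach materializes all of a key's values in memory, which can be a problem for skewed keys. An alternative is to sort once on a composite key. Below is a minimal sketch of that idea; the object name TwoSortByCompositeKey is made up for illustration, and it assumes Int values where negating the second column is safe (no Int.MinValue in the data):

package interview

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object TwoSortByCompositeKey {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[*]")
      .setAppName(this.getClass.getCanonicalName)
      .set("spark.default.parallelism", "1")
    val sc = new SparkContext(conf)

    val pairs: RDD[(Int, Int)] = sc.textFile("data/input/twosorts.txt")
      .map { line =>
        val fields = line.split(" ")
        (fields(0).toInt, fields(1).toInt)
      }

    // Sorting ascending on (key, -value) gives key ascending and, within
    // each key, value descending, without collecting a key's values into
    // a single in-memory list.
    pairs
      .sortBy(kv => (kv._1, -kv._2), ascending = true, numPartitions = 1)
      .foreach(println)

    sc.stop()
  }
}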
Data (data/input/twosorts.txt):
20 21
50 51
50 54
60 51
60 53
70 58
60 61
70 54
70 57
70 58
60 61
70 54
70 57
70 58
10 55
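For reference, with this data and a single partition, either version should print keys ascending with values descending within each key:

(10,55)
(20,21)
(50,54)
(50,51)
(60,61)
(60,61)
(60,53)
(60,51)
(70,58)
(70,58)
(70,58)
(70,57)
(70,57)
(70,54)
(70,54)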