Spark 基本排序
数据:
41
2
45
21
4
7
5
2
4
524
41
代码:
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
object Sort {
  // Windows-only workaround so Spark/Hadoop can find winutils.exe.
  // NOTE(review): hard-coded local path — assumes this exact install layout; confirm.
  System.setProperty("hadoop.home.dir", "D:\\soft\\hadoop\\hadoop-2.9.2")

  /** Reads one integer per line from a text file, sorts them ascending,
    * attaches a 0-based rank to each value, and prints "(rank,value)" pairs.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("sort")
    val sc = new SparkContext(conf)
    val lineRdd = sc.textFile("F:/out/sort.txt")
    // Alternative 1: pair each value with a dummy string and use sortByKey,
    // e.g. ((1,""), (3,""), (344,"")), then drop the dummy:
    // val pairRdd = lineRdd.map(line => (line.toInt,""))
    // val sortRdd = pairRdd.sortByKey().map(tuple => tuple._1)
    // Alternative 2 (used here): sortBy numeric value, zipWithIndex to attach
    // the sorted rank, then swap so the rank becomes the key: (rank, value).
    val sortRdd = lineRdd.sortBy(line => line.toInt).zipWithIndex().map(tuple => tuple.swap)
    // Repartitioning options: coalesce vs repartition.
    // coalesce: mainly for reducing partition count; avoids a shuffle.
    // repartition: increases or decreases partitions; always shuffles.
    // val resultRdd = sortRdd.repartition(1)
    // val resultRdd = sortRdd.coalesce(1)
    val resultRdd = sortRdd.partitionBy(new HashPartitioner(1))
    // resultRdd.saveAsTextFile("d://output")
    // collect() first: RDD.foreach(println) runs on the executors, so in
    // cluster mode the output would go to executor stdout, not the driver.
    // The data set is tiny, so collecting to the driver is safe here.
    resultRdd.collect().foreach(println)
    sc.stop()
  }
}
结果:
(0,2)
(1,2)
(2,4)
(3,4)
(4,5)
(5,7)
(6,21)
(7,41)
(8,41)
(9,45)
(10,524)