方法1
自定义key
/** Composite key for secondary sort: orders by firstKey, then by secondKey.
  *
  * Serializable so Spark can ship instances across the cluster during the
  * shuffle triggered by sortByKey.
  *
  * @param firstKey  primary sort field
  * @param secondKey tie-breaker used when firstKey values are equal
  */
class SecordSortKey(val firstKey: Int, val secondKey: Int)extends Ordered[SecordSortKey] with Serializable{
override def compare(that: SecordSortKey):Int = {
// Use Integer.compare instead of subtraction: `this.firstKey - that.firstKey`
// overflows for large-magnitude values (e.g. Int.MinValue - 1 == Int.MaxValue),
// which would silently produce a wrong ordering.
if(this.firstKey != that.firstKey) {
Integer.compare(this.firstKey, that.firstKey)
}else {
Integer.compare(this.secondKey, that.secondKey)
}
}
}
实现
/** Driver for the secondary-sort example: reads space-separated int pairs
  * from the file "test", sorts them by (firstKey, secondKey) descending via
  * the custom SecordSortKey, and prints the original lines in sorted order.
  */
object SecordSortTest {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[2]").setAppName("SecordSort")
val sc = new SparkContext(conf)
try {
val lines = sc.textFile("test")
//第二步:将要进行二次排序的数据加载,按照<key,value>格式的RDD
// Split each line once (the original called line.split(" ") twice per record).
val pairSortKey = lines.map { line =>
val fields = line.split(" ")
(new SecordSortKey(fields(0).toInt, fields(1).toInt), line)
}
//第三步:使用sortByKey 基于自定义的key进行二次排序
// sortByKey(false) => descending order per SecordSortKey.compare.
val sortPair = pairSortKey.sortByKey(false)
val sortResult = sortPair.map(_._2)
sortResult.collect().foreach { x => print(x) }
} finally {
sc.stop() // always release the SparkContext, even if the job fails
}
}
}
方法2
[root@iteblog.com /tmp]# vim data.txt
2015,1,24
2015,3,56
2015,1,3
2015,2,-43
2015,4,5
2015,3,46
2014,2,64
2015,1,4
2015,1,21
2015,2,35
2015,2,0
我们期望的输出结果是
2014-2 64
2015-1 3,4,21,24
2015-2 -43,0,35
2015-3 46,56
2015-4 5
scala> val file = sc.textFile("/tmp/data.txt")
file: org.apache.spark.rdd.RDD[String] = /tmp/data.txt MapPartitionsRDD[1] at textFile at <console>:27
scala> val data = file.map(_.split(",")).map(item => (s"${item(0)}-${item(1)}", item(2)))
data: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[3] at map at <console>:29
scala> data.collect().foreach(println)
(2015-1,24)
(2015-3,56)
(2015-1,3)
(2015-2,-43)
(2015-4,5)
(2015-3,46)
(2014-2,64)
(2015-1,4)
(2015-1,21)
(2015-2,35)
(2015-2,0)
scala> val rdd = data.groupByKey
rdd: org.apache.spark.rdd.RDD[(String, Iterable[String])] = ShuffledRDD[5] at groupByKey at <console>:31
scala> rdd.collect().foreach(println)
(2014-2,CompactBuffer(64))
(2015-1,CompactBuffer(24, 3, 4, 21))
(2015-2,CompactBuffer(35, 0, -43))
(2015-3,CompactBuffer(56, 46))
(2015-4,CompactBuffer(5))
scala> val result = rdd.map(item => (item._1, item._2.toList.sortWith(_.toInt<_.toInt)))
result: org.apache.spark.rdd.RDD[(String, List[String])] = MapPartitionsRDD[20] at map at <console>:33
scala> result.collect.foreach(item => println(s"${item._1}\t${item._2.mkString(",")}"))
2014-2 64
2015-1 3,4,21,24
2015-2 -43,0,35
2015-3 46,56
2015-4 5
方法3
sortBy函数中的第一个参数可以对排序方式进行重写。为什么sortByKey没有呢?难道只能用默认的排序规则?不是,是有的。其实在OrderedRDDFunctions类中有个变量ordering,它是隐式的:private val ordering = implicitly[Ordering[K]]
。它就是默认的排序规则,我们可以对它进行重写,如下:
scala> val a = sc.parallelize(List("wyp", "iteblog", "com", "397090770", "test"))
a: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[37] at parallelize at <console>:12
scala> val b = sc.parallelize(List(3,1,9,12,4))
b: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[38] at parallelize at <console>:12
scala> val c = b.zip(a)
c: org.apache.spark.rdd.RDD[(Int, String)] = ZippedPartitionsRDD2[39] at zip at <console>:16
scala> c.sortByKey().collect
res15: Array[(Int, String)] = Array((1,iteblog), (3,wyp), (4,test), (9,com), (12,397090770))
scala> implicit val sortIntegersByString = new Ordering[Int]{
|override def compare(a: Int, b: Int) =
|a.toString.compare(b.toString)}
sortIntegersByString: Ordering[Int] = $iwC$$iwC$$iwC$$iwC$$iwC$$anon$1@5d533f7a
scala> c.sortByKey().collect
res17: Array[(Int, String)] = Array((1,iteblog), (12,397090770), (3,wyp), (4,test), (9,com))