Source of the sortBy function: it takes three parameters; the first is required, while the second and third are optional.
def sortBy[K](
    f: (T) => K,
    ascending: Boolean = true,
    numPartitions: Int = this.partitions.length)
    (implicit ord: Ordering[K], ctag: ClassTag[K]): RDD[T] = withScope {
  this.keyBy[K](f)
      .sortByKey(ascending, numPartitions)
      .values
}
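As the source shows, sortBy builds a key with f, sorts by that key, and then drops the key. The sketch below illustrates that equivalence; the object name SortByEquivalence and the sample data are illustrative, not from the original post.

import org.apache.spark.{SparkConf, SparkContext}

object SortByEquivalence {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SortByEquivalence").setMaster("local[1]"))
    val nums = sc.parallelize(Seq(3, 1, 2), 2)

    // Two equivalent ways to sort descending into 2 partitions
    val viaSortBy = nums.sortBy(x => x, ascending = false, numPartitions = 2).collect()
    val viaKeyBy  = nums.keyBy(x => x).sortByKey(false, 2).values.collect()

    println(viaSortBy.mkString(","))  // 3,2,1
    println(viaKeyBy.mkString(","))   // 3,2,1
    sc.stop()
  }
}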
1. Example 1: sort by value in descending order
package com.test.spark

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author admin
 * sortBy is an enhanced version of sortByKey.
 * Sort the pairs by value.
 */
object SparkSortByApplication {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SortSecond").setMaster("local[1]")
    val sc = new SparkContext(conf)
    val datas = sc.parallelize(Array(("cc", 12), ("bb", 32), ("cc", 22), ("aa", 18), ("bb", 16),
      ("dd", 16), ("ee", 54), ("cc", 1), ("ff", 13), ("gg", 32), ("bb", 4)))
    // Sum the values for each key
    val counts = datas.reduceByKey(_ + _)
    // Sort by value in descending order
    val sorts = counts.sortBy(_._2, false)
    sorts.collect().foreach(println)
    sc.stop()
  }
}
Output:
(ee,54)
(bb,52)
(cc,35)
(gg,32)
(aa,18)
(dd,16)
(ff,13)
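This is why the code comment calls sortBy an enhanced sortByKey: with sortByKey alone, sorting (key, value) pairs by value requires swapping the pairs first. A minimal sketch, assuming the counts RDD from the example above (the name sortsViaSortByKey is illustrative):

// Swap so the value becomes the key, sort, then swap back
val sortsViaSortByKey = counts
  .map(_.swap)       // (value, key)
  .sortByKey(false)  // descending by value
  .map(_.swap)       // back to (key, value)
// Equivalent to: counts.sortBy(_._2, false)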
2. Example 2: sort by the first element in ascending order; when the first elements are equal, sort by the second element in ascending order
package com.sudiyi.spark

import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author xubiao
 * sortBy is an enhanced version of sortByKey.
 * Sort by the first element, then by the second element, both ascending.
 */
object SparkSortByApplication {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SortSecond").setMaster("local[1]")
    val sc = new SparkContext(conf)
    val arr = Array((1, 6, 3), (2, 3, 3), (1, 1, 2), (1, 3, 5), (2, 1, 2))
    val datas2 = sc.parallelize(arr)
    // Sort by a composite key: (first element, second element), ascending
    val sorts2 = datas2.sortBy(e => (e._1, e._2))
    sorts2.collect().foreach(println)
    sc.stop()
  }
}
Output:
(1,1,2)
(1,3,5)
(1,6,3)
(2,1,2)
(2,3,3)
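The composite key (e._1, e._2) relies on Scala's default tuple Ordering, so both components sort ascending. To mix directions, one option is to negate a numeric component. A minimal sketch, assuming the datas2 RDD from the example above (the name mixed is illustrative):

// First element ascending, second element descending
val mixed = datas2.sortBy(e => (e._1, -e._2))
mixed.collect().foreach(println)
// Expected order: (1,6,3), (1,3,5), (1,1,2), (2,3,3), (2,1,2)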