输入数据:
aa 78
bb 98
aa 80
cc 98
aa 69
cc 87
bb 97
cc 86
aa 97
bb 78
bb 34
cc 85
bb 92
cc 72
bb 32
bb 23
期望输出:
(aa,List(78, 80, 97))
(bb,List(92, 97, 98))
(cc,List(86, 87, 98))
或
(aa,ArrayBuffer(78, 80, 97))
(bb,ArrayBuffer(92, 97, 98))
(cc,ArrayBuffer(86, 87, 98))
scala代码:
object GroupSortedTopN {
  /** Reads "key value" pairs from `groupbykey.txt` and emits the top-k
    * (largest) values per key, demonstrating two strategies:
    *
    *  1. `groupByKey`     — simple, but shuffles every value for a key to a
    *     single executor before sorting (can OOM on skewed keys).
    *  2. `aggregateByKey` — truncates the per-partition accumulator to k
    *     elements at every step, so at most k values per key cross the
    *     network (map-side combine).
    *
    * Results are printed and saved to `groupByKeyResult` / `groupByKeyResult2`.
    */
  def main(args: Array[String]): Unit = {
    val k = 3
    val conf = new SparkConf().setMaster("local").setAppName("GroupSortedTopnz")
    val sc = new SparkContext(conf)

    // Parse "key value" lines, silently dropping malformed rows.
    val initRdd: RDD[(String, Int)] = sc.textFile("groupbykey.txt")
      .map(_.split(" "))
      .filter(_.length == 2)
      .map(arr => (arr(0), arr(1).toInt))
    // Cached because two independent lineages (resultRdd and resultRdd2)
    // both read it; without the cache the file is parsed twice.
    initRdd.cache()

    // Strategy 1: collect all values per key, sort ascending, keep the k
    // largest via takeRight.
    val resultRdd: RDD[(String, List[Int])] = initRdd.groupByKey()
      .map {
        case (key, values) =>
          val topK = values.toList.sorted.takeRight(k)
          (key, topK)
      }

    // Strategy 2: the accumulator returned from each seqOp/combOp is already
    // truncated to the k largest elements, bounding memory and shuffle size.
    val resultRdd2: RDD[(String, ArrayBuffer[Int])] = initRdd.aggregateByKey(ArrayBuffer[Int]())(
      (u, v) => {
        u += v
        u.sorted.takeRight(k) // keep only the k largest seen so far
      },
      (u1, u2) => {
        u1 ++= u2
        u1.sorted.takeRight(k)
      }
    )

    resultRdd.cache()
    resultRdd.foreachPartition(_.foreach(println))
    resultRdd.saveAsTextFile("groupByKeyResult")

    resultRdd2.cache()
    resultRdd2.foreachPartition(_.foreach(println))
    resultRdd2.saveAsTextFile("groupByKeyResult2")

    // BUG FIX: the original called initRdd.unpersist() BEFORE any action ran.
    // unpersist() is eager by default, so the cache was evicted before either
    // lineage was ever computed and initRdd was recomputed from the file for
    // both result RDDs. All unpersists must come after the actions above.
    initRdd.unpersist()
    resultRdd.unpersist()
    resultRdd2.unpersist() // was leaked (cached but never unpersisted) in the original

    sc.stop()
  }
}