Main steps:
- Load the input file into an RDD (textFile)
- Transform the RDD, splitting out the attribute to sort on and producing a new keyed RDD (map)
- Group by key, sort each group on the attribute extracted in step 2, and take the top N of each group (groupByKey, map); the whole chain is sketched below
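Condensed, the three steps amount to a single transformation chain. A minimal sketch, assuming the same whitespace-separated "province city saleCount" input, the sc and filepath defined in the full program below, and a top N of 3:

val top3ByProvince = sc.textFile(filepath)
  .filter(_.trim.nonEmpty)                      // drop blank lines
  .map { line =>
    val f = line.split("\\s+")
    (f(0), (f(2).toInt, line))                  // (province, (saleCount, full line))
  }
  .groupByKey()
  .mapValues(_.toList.sortBy(-_._1).take(3))    // sort each group descending, keep top 3

The full program spells these steps out one at a time: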
package rddDemo.examples
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Sample input lines:
 *   anhui hefei 20
 *   jiangsu nanjing 90
 *   shandong jinan 100
 *
 * Created by asus on 2018/7/29.
 */
object TopNWithGroupDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("TopNWithGroupDemo")
    conf.setMaster("local[2]")
    // Windows only: point Spark at a local Hadoop installation (for winutils)
    System.setProperty("hadoop.home.dir", "E:\\hadoop-2.6.0")
    val sc = new SparkContext(conf)
    val filepath = "hdfs://192.168.204.130:9000/log_file/sale_data.txt"
    // Load the HDFS file into an RDD and drop blank lines with filter
    val saleRDD = sc.textFile(filepath, 2).filter(_.trim.nonEmpty)
    // Re-key the RDD: province becomes the key, (saleCount, full line) the value
    val saleWithProvinceRDD = saleRDD.map { line =>
      val lineInfo = line.split("\\s+")
      val province = lineInfo(0)
      val saleCount = lineInfo(2).toInt
      (province, (saleCount, line))
    }
    // saleWithProvinceRDD.foreach(println)
    // Group the records by province
    val saleGroupByProvinceRDD = saleWithProvinceRDD.groupByKey()
    // Sort within each group and take the group's top N
    val saleGroupByProvinceTopNRDD = saleGroupByProvinceRDD.map { tuple =>
      // The key: the province
      val province = tuple._1
      // The values: all (saleCount, line) pairs for this province
      val saleInfo = tuple._2
      // Copy the grouped values into a list so they can be sorted
      var saleInfoList = List[(Int, String)]()
      for ((saleCount, info) <- saleInfo) {
        saleInfoList = (saleCount, info) :: saleInfoList
      }
      // Sort the group's records by sale count, descending
      saleInfoList = saleInfoList.sortWith((s1, s2) => s1._1 > s2._1)
      // Keep only the top 3
      saleInfoList = saleInfoList.take(3)
      (province, saleInfoList)
    }
    saleGroupByProvinceTopNRDD.foreach(println)
    sc.stop()
  }
}
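With the three sample lines from the header comment, each province contributes a single record, so each group's top 3 is just that record. The foreach output would look like the following (ordering across partitions is nondeterministic):

(anhui,List((20,anhui hefei 20)))
(jiangsu,List((90,jiangsu nanjing 90)))
(shandong,List((100,shandong jinan 100)))

One design caveat: groupByKey materializes all values for a key in memory on a single executor, which can be a problem for heavily skewed keys. A bounded-memory alternative is aggregateByKey, which never keeps more than N records per key while merging. A minimal sketch against the saleWithProvinceRDD above (not the author's approach, just an alternative):

val top3Agg = saleWithProvinceRDD.aggregateByKey(List.empty[(Int, String)])(
  (acc, v) => (v :: acc).sortBy(-_._1).take(3), // fold one value into a partition-local top 3
  (a, b) => (a ++ b).sortBy(-_._1).take(3)      // merge two partition-local top-3 lists
)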