求 Top N 的多种实现方法（multiple ways to compute per-group Top N with Spark RDD）
第一种：groupBy 分组后在内存中排序取前 N
package cn.doit.spark.day05.demo02
import cn.doit.SparkUtil
import org.apache.spark.rdd.RDD
/*
http://bigdata.51doit.cn/laozhang
http://bigdata.51doit.cn/laozhang
http://bigdata.51doit.cn/laozhao
http://bigdata.51doit.cn/laozhao
http://bigdata.51doit.cn/laozhao
http://bigdata.51doit.cn/laozhao
http://bigdata.51doit.cn/laozhao
http://bigdata.51doit.cn/laoduan
http://bigdata.51doit.cn/laoduan
http://javaee.51doit.cn/xiaoxu
http://javaee.51doit.cn/xiaoxu
http://javaee.51doit.cn/laoyang
http://javaee.51doit.cn/laoyang
http://javaee.51doit.cn/laoyang
http://bigdata.51doit.cn/laozhao
http://bigdata.51doit.cn/laozhao
http://bigdata.51doit.cn/laozhao
*/
object Teacher {
  // Number of top teachers to keep per subject.
  private val TopN = 3

  /**
   * Method 1 for per-subject Top N: parse access-log URLs of the form
   * http://&lt;subject&gt;.51doit.cn/&lt;teacher&gt;, count visits per (subject, teacher),
   * then groupBy subject and sort each group in driver-side memory.
   *
   * args(0) is the input path. NOTE(review): the original read args(1), which is
   * inconsistent with the sibling Teacher2 (which reads args(0)) and would throw
   * ArrayIndexOutOfBoundsException when only one path argument is supplied.
   */
  def main(args: Array[String]): Unit = {
    val sc = SparkUtil.createContext()
    // Create the RDD from the input path (first program argument).
    val lines = sc.textFile(args(0))
    // Split each URL and build ((subject, teacher), 1) pairs.
    val rdd2: RDD[((String, String), Int)] = lines.map(line => {
      val spl = line.split("/")
      val subject = spl(2).split("\\.")(0) // host prefix, e.g. "bigdata"
      val name = spl(3)                    // path segment, e.g. "laozhao"
      ((subject, name), 1)
    })
    // Aggregate visit counts per (subject, teacher).
    val rdd3: RDD[((String, String), Int)] = rdd2.reduceByKey(_ + _)
    // Group every (subject, teacher) count under its subject.
    // CAVEAT: groupBy materializes all of a subject's records in one task,
    // which can OOM on skewed data — acceptable for this demo ("method 1").
    val rdd4: RDD[(String, Iterable[((String, String), Int)])] = rdd3.groupBy(_._1._1)
    // Sort each subject's counts descending in memory and keep the top N,
    // dropping the now-redundant subject from the value tuples.
    val rdd5: RDD[(String, Seq[(String, Int)])] = rdd4.mapValues(it => {
      // Pull the iterable into memory so it can be sorted.
      val list: Seq[((String, String), Int)] = it.toList
      list.sortBy(-_._2).take(TopN).map(e => (e._1._2, e._2))
    })
    println(rdd5.collect().toBuffer)
    sc.stop()
  }
}
第二种：另一种实现方式（见下方 Teacher2，此处列表被截断）
package cn.doit.spark.day05.demo02
import cn.doit.SparkUtil
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
object Teacher2 {
def main(args: Array[String]): Unit = {
//调用封装方法,获取SparkContext
val sc: SparkContext = SparkUtil.createContext()
//获取rdd 读取文件
val rdd1: RDD[String] = sc.textFile(args(0))
//切割 取索引 拼接kv
val rdd2: RDD[((String, String), Int)] = rdd1.map(e => {
val fields = e.split("/")
val url = fields(2)