Sample input (one URL per line, teacher name is the last path segment):
http://bigdata.edu360.cn/laozhang
1. Find the top-N teachers most favored by students (overall)
package
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf,SparkContext}
/**
 * Counts how often each teacher appears in the input URLs and prints the
 * two most frequent ones (the "top 2 favorite teachers").
 *
 * Expected input: a text file where each line is a URL whose last path
 * segment is a teacher name, e.g. http://bigdata.edu360.cn/laozhang
 * (assumed from the sample data above — confirm against real input).
 */
object FavTeacher {
  def main(args: Array[String]): Unit = {
    // setMaster("local[*]") runs Spark in local mode with one thread per
    // core — convenient for debugging without a cluster.
    val conf = new SparkConf().setAppName("FavTeacher").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // First program argument is the input path.
    val lines: RDD[String] = sc.textFile(args(0))
    // Extract the teacher (text after the last '/') and pair it with a count of 1.
    val teacherAndOne: RDD[(String, Int)] = lines.map { line =>
      val index = line.lastIndexOf("/")
      val teacher = line.substring(index + 1)
      (teacher, 1)
    }
    // Sum the 1s per teacher.
    val reduced = teacherAndOne.reduceByKey(_ + _)
    // Order by count, highest first.
    val sorted = reduced.sortBy(_._2, ascending = false)
    // Keep only the top two teachers.
    val top2: Array[(String, Int)] = sorted.take(2)
    println(top2.toBuffer)
    sc.stop()
  }
}
2. Find the top-N favorite teacher(s) per subject
package
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf,SparkContext}
/**
 * For each subject, finds the teacher that appears most often in the input
 * URLs and prints one (teacher, count) winner per subject.
 *
 * Expected input: one URL per line; the teacher is the last path segment and
 * the "subject" is the segment just before it (for URLs like
 * http://bigdata.edu360.cn/laozhang this is the host — presumably the
 * subject is encoded there; verify against the real data format).
 */
object FavTeacher1 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("FavTeacher").setMaster("local[*]")
    val sc = new SparkContext(conf)
    // First program argument is the input path.
    val lines: RDD[String] = sc.textFile(args(0))
    // Split each URL into ((subject, teacher), 1).
    val subjectTeacherAndOne: RDD[((String, String), Int)] = lines.map { line =>
      val index = line.lastIndexOf("/")           // start of the teacher segment
      val subUrl = line.substring(0, index)       // URL without the teacher part
      val subIndex = subUrl.lastIndexOf("/")
      val subject = subUrl.substring(subIndex + 1)
      val teacher = line.substring(index + 1)
      ((subject, teacher), 1)
    }
    // Sum the 1s per (subject, teacher) pair.
    val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_ + _)
    // Group all (subject, teacher) counts by subject.
    // (Alternative: grouped.mapValues(_.toList.sortBy(_._2).reverse.take(n))
    // would keep the subject key and return a top-N list per subject.)
    val grouped: RDD[(String, Iterable[((String, String), Int)])] = reduced.groupBy(_._1._1)
    // Each Iterable holds one subject's teachers; sort it in executor memory
    // (fine while the number of teachers per subject is small) and keep the
    // single most popular teacher.
    val values: RDD[Iterable[((String, String), Int)]] = grouped.values
    val result = values.map { it =>
      it.toList.sortBy(_._2).reverse.head
    }
    val arr = result.collect()
    println(arr.toBuffer)
    sc.stop()
  }
}