The data format is:
http://python.cn/wei
http://python.cn/wei
http://java.cn/zhang
http://java.cn/zhang
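Splitting a line on "/" yields Array("http:", "", "python.cn", "wei"): index 2 is the host (the subject plus a ".cn" suffix) and index 3 is the teacher. A minimal sketch of the parsing step, runnable in the Scala REPL:

val line = "http://python.cn/wei"
val fields = line.split("/")               // Array("http:", "", "python.cn", "wei")
val subject = fields(2).replace(".cn", "") // "python"
val teacher = fields(3)                    // "wei"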
package spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object GroupTeacher {
def main(args: Array[String]): Unit = {
val topN = 3
val conf = new SparkConf().setAppName("Teacher").setMaster("local[4]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("D:\\code\\ip\\teacher.log")
val subjectTeacherAndOne: RDD[((String, String), Int)] = lines.map(line => {
val fields = line.split("/")
val subject = fields(2).replace(".cn", "")
val teacher = fields(3)
((subject, teacher), 1)
})
val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_+_)//aggregate the counts, using (subject, teacher) together as the key
val grouped: RDD[(String, Iterable[((String, String), Int)])] = reduced.groupBy(_._1._1)//group by subject
//Scala collection sorting happens in memory, which may not fit; the RDD's sortBy (used in the next version) can be substituted
val sorted = grouped.mapValues(_.toList.sortBy(_._2).reverse.take(topN))//after grouping, one partition may hold several subjects, but each subject's group is sorted on a single machine, so Scala collection methods work here
val r: Array[(String, List[((String, String), Int)])] = sorted.collect()
println(r.toBuffer)
sc.stop()
}
}
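For the four sample lines shown at the top, this prints something like the following (the order of the two subjects may vary between runs):
ArrayBuffer((python,List(((python,wei),2))), (java,List(((java,zhang),2))))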
Optimization: perform the sort on the worker machines with the RDD's sortBy instead of in a local Scala collection
package spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object GroupTeacher2 {
def main(args: Array[String]): Unit = {
val topN = 3
val subjects = Array("python", "java", "php")
val conf = new SparkConf().setAppName("GroupFavTeacher2").setMaster("local[4]")
val sc = new SparkContext(conf)
sc.setCheckpointDir("D:\\code\\ip\\ck")
val lines: RDD[String] = sc.textFile("D:\\code\\ip\\teacher.log")
val subjectTeacherAndOne: RDD[((String, String), Int)] = lines.map(line => {
val fields = line.split("/")
val subject = fields(2).replace(".cn", "")
val teacher = fields(3)
((subject, teacher), 1)
})
val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_+_)
//val cached = reduced.cache()//cache in memory (only mark an RDD for caching when it will be reused repeatedly)
reduced.checkpoint()
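//note: checkpoint() is lazy; the data is written to the checkpoint directory only when the first action (take, below) runs, and Spark recomputes the RDD once more to write it unless it is also cached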
//Scala collection sorting is in-memory and may not fit; the RDD's sortBy method sorts using memory plus disk
for (sb <- subjects) {
val filtered: RDD[((String, String), Int)] = reduced.filter(_._1._1 == sb)//keep only one subject's data
val favTeacher = filtered.sortBy(_._2, false).take(topN)//use the RDD's sortBy (take is an action and triggers job submission)
println(favTeacher.toBuffer)
}
sc.stop()
}
}
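The loop above submits one job per subject; the checkpoint keeps reduced from being recomputed from the source file, but each iteration still reads it back from disk. A minimal sketch of the cached variant hinted at by the commented-out line (same logic, with reduced held in memory after the first action):

val cached: RDD[((String, String), Int)] = reduced.cache()//materialized by the first action that touches it
for (sb <- subjects) {
  println(cached.filter(_._1._1 == sb).sortBy(_._2, false).take(topN).toBuffer)
}
cached.unpersist()//release the cached blocks once all subjects are processed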
Optimization: add a custom partitioner (Partitioner) so records are partitioned by subject, avoiding data skew
package spark
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
import scala.collection.mutable
object GroupTeacher3 {
def main(args: Array[String]): Unit = {
val topN = 3
val conf = new SparkConf().setAppName("GroupTeacher").setMaster("local[4]")
val sc = new SparkContext(conf)
val lines: RDD[String] = sc.textFile("D:\\code\\ip\\teacher.log")
val subjectTeacherAndOne: RDD[((String, String), Int)] = lines.map(line => {
val fields = line.split("/")
val subject = fields(2).replace(".cn", "")
val teacher = fields(3)
((subject, teacher), 1)
})
val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_+_)
val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
val sbPartitioner = new SubjectPartitioner(subjects)//build a custom partitioner keyed on subject
val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(sbPartitioner)//partitionBy repartitions the data according to the given partitioner (this triggers a shuffle)
val sorted: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
it.toList.sortBy(_._2).reverse.take(topN).iterator//convert the iterator to a list, sort it, then return the top N as an iterator
})
val r: Array[((String, String), Int)] = sorted.collect()
println(r.toBuffer)
sc.stop()
}
}
//custom partitioner
class SubjectPartitioner(sbs: Array[String]) extends Partitioner {
//map each subject to its own partition index
val rules = new mutable.HashMap[String, Int]()
var i = 0
for (sb <- sbs) {
rules.put(sb, i)
i += 1
}
override def numPartitions: Int = sbs.length//number of partitions (how many partitions the next RDD will have)
//compute the partition index from the key
//the key is a tuple (String, String)
override def getPartition(key: Any): Int = {
val subject = key.asInstanceOf[(String, String)]._1//extract the subject name
rules(subject)//look up the partition index in the rules
}
}
}
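One remaining weak spot in GroupTeacher3: the mapPartitions step calls it.toList, which pulls an entire partition into memory before sorting. A hedged sketch of a common refinement (not part of the original code): stream through the iterator and keep at most topN entries in a bounded TreeSet. Because the custom partitioner puts exactly one subject in each partition, the per-partition top N is that subject's top N. The tie-breaking Ordering is an assumption made for the sketch:

//drop-in replacement for the sorted line in GroupTeacher3.main (partitioned, topN and the mutable import are already in scope there)
val sorted: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
  //order by count, tie-breaking on teacher name so distinct entries can coexist in the set
  val top = mutable.TreeSet.empty[((String, String), Int)](Ordering.by(t => (t._2, t._1._2)))
  it.foreach { t =>
    top += t
    if (top.size > topN) top -= top.firstKey//evict the current minimum to keep at most topN entries
  }
  top.toList.reverse.iterator//largest counts first
})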