Spark Group

The input data format is as follows; each line is a URL whose host carries the subject and whose last path segment carries the teacher:

http://python.cn/wei
http://python.cn/wei
http://java.cn/zhang
http://java.cn/zhang
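As a quick sanity check of the parsing logic used below (a minimal sketch, not from the original post): splitting on "/" leaves the host at index 2 and the teacher at index 3, because the "//" after "http:" produces an empty field at index 1.

val line = "http://python.cn/wei"
val fields = line.split("/")               // Array("http:", "", "python.cn", "wei")
val subject = fields(2).replace(".cn", "") // "python"
val teacher = fields(3)                    // "wei"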
package spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GroupTeacher {
  def main(args: Array[String]): Unit = {
    val topN = 3
    val conf = new SparkConf().setAppName("Teacher").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("D:\\code\\ip\\teacher.log")
    val subjectTeacherAndOne: RDD[((String, String), Int)] = lines.map(line => {
      val fields = line.split("/")
      val subject = fields(2).replace(".cn", "")
      val teacher = fields(3)
      ((subject, teacher), 1)
    })
    // Aggregate, using (subject, teacher) as the combined key
    val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_ + _)
    // Group by subject
    val grouped: RDD[(String, Iterable[((String, String), Int)])] = reduced.groupBy(_._1._1)
    // Scala collection sorting happens in memory, which may not fit; RDD.sortBy can be used instead.
    // After grouping, each value holds a single subject's data in one place,
    // so a plain in-memory Scala sort is acceptable here.
    val sorted = grouped.mapValues(_.toList.sortBy(_._2).reverse.take(topN))
    val r: Array[(String, List[((String, String), Int)])] = sorted.collect()
    println(r.toBuffer)
    sc.stop()
  }
}
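With the four sample lines above, the collected result would look roughly like the following (my own illustration, not output from the original post; the order of subjects in the buffer is not guaranteed):

ArrayBuffer((python,List(((python,wei),2))), (java,List(((java,zhang),2))))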

Optimization: sort with RDD.sortBy, which distributes the sort across machines and can spill from memory to disk, instead of sorting a Scala collection entirely in memory.

package spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object GroupTeacher2 {
  def main(args: Array[String]): Unit = {
    val topN = 3
    val subjects = Array("python", "java", "php")
    val conf = new SparkConf().setAppName("GroupFavTeacher2").setMaster("local[4]")
    val sc = new SparkContext(conf)
    sc.setCheckpointDir("D:\\code\\ip\\ck")
    val lines: RDD[String] = sc.textFile("D:\\code\\ip\\teacher.log")
    val subjectTeacherAndOne: RDD[((String, String), Int)] = lines.map(line => {
      val fields = line.split("/")
      val subject = fields(2).replace(".cn", "")
      val teacher = fields(3)
      ((subject, teacher), 1)
    })
    val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_ + _)
    // val cached = reduced.cache() // cache in memory (only worth marking an RDD as cached if it is reused repeatedly)
    reduced.checkpoint()
    // Scala collection sorting happens in memory, which may not fit;
    // RDD.sortBy sorts using memory plus disk
    for (sb <- subjects) {
      // Filter down to one subject's data
      val filtered: RDD[((String, String), Int)] = reduced.filter(_._1._1 == sb)
      // Call RDD.sortBy (take is an action and triggers a job submission)
      val favTeacher = filtered.sortBy(_._2, false).take(topN)
      println(favTeacher.toBuffer)
    }
    sc.stop()
  }
}
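Since reduced is reused once per subject inside the loop, the commented-out cache line is worth enabling when the subject list is long; a minimal sketch of that variant (my assumption, extending the commented-out line above):

val cached = reduced.cache() // materialize once, reuse across all subjects
for (sb <- subjects) {
  val favTeacher = cached.filter(_._1._1 == sb).sortBy(_._2, false).take(topN)
  println(favTeacher.toBuffer)
}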


Optimization: add a custom Partitioner and repartition by it, so each subject lands in its own partition and data skew is avoided.

package cn.edu360.day3

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

import scala.collection.mutable

object GroupTeacher3 {

  def main(args: Array[String]): Unit = {
    val topN = 3
    val conf = new SparkConf().setAppName("GroupTeacher").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val lines: RDD[String] = sc.textFile("D:\\code\\ip\\teacher.log")
    val subjectTeacherAndOne: RDD[((String, String), Int)] = lines.map(line => {
      val fields = line.split("/")
      val subject = fields(2).replace(".cn", "")
      val teacher = fields(3)
      ((subject, teacher), 1)
    })
    val reduced: RDD[((String, String), Int)] = subjectTeacherAndOne.reduceByKey(_ + _)
    val subjects: Array[String] = reduced.map(_._1._1).distinct().collect()
    // Build a custom partitioner, then repartition according to it
    val sbPartitioner = new SubjectPartitioner(subjects)
    // partitionBy repartitions by the given partitioning rule
    val partitioned: RDD[((String, String), Int)] = reduced.partitionBy(sbPartitioner)
    val sorted: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
      // Materialize the iterator as a list, sort it, then return an iterator again
      it.toList.sortBy(_._2).reverse.take(topN).iterator
    })
    val r: Array[((String, String), Int)] = sorted.collect()
    println(r.toBuffer)
    sc.stop()
  }
}

// Custom partitioner
class SubjectPartitioner(sbs: Array[String]) extends Partitioner {
  // Map each subject to a partition index
  val rules = new mutable.HashMap[String, Int]()
  var i = 0
  for (sb <- sbs) {
    rules.put(sb, i)
    i += 1
  }

  // Number of partitions (how many partitions the next RDD will have)
  override def numPartitions: Int = sbs.length

  // Compute the partition index from the key;
  // the key is a (String, String) tuple of (subject, teacher)
  override def getPartition(key: Any): Int = {
    val subject = key.asInstanceOf[(String, String)]._1 // extract the subject name
    rules(subject) // look up the partition index from the rules
  }
}
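One remaining weakness: it.toList in mapPartitions still pulls an entire partition into memory before sorting. A minimal sketch of a bounded alternative (my own variant, not from the original post) that keeps at most topN entries per partition in a sorted set:

import scala.collection.mutable

val sorted: RDD[((String, String), Int)] = partitioned.mapPartitions(it => {
  // Order by count, with the key as a tie-breaker so distinct entries are never dropped
  implicit val ord: Ordering[((String, String), Int)] = Ordering.by(t => (t._2, t._1))
  val top = mutable.TreeSet.empty[((String, String), Int)]
  it.foreach { t =>
    top += t
    if (top.size > topN) top -= top.head // evict the current minimum
  }
  top.toList.reverse.iterator // largest counts first
})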

