Spark Operators
groupBy - grouping
package com.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo7GroupBy {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("Demo7GroupBy")
conf.setMaster("local")
val sc = new SparkContext(conf)
//read the student table data
val linesRDD: RDD[String] = sc.textFile("data/students.txt")
//split each line on commas
val wordsRDD: RDD[Array[String]] = linesRDD.map(lines => lines.split(","))
//extract the class and the age
val clazzAndAge: RDD[(String, Int)] = wordsRDD.map{
case Array(_,_,age:String,_,clazz:String) =>
(clazz,age.toInt)
}
/**
* groupBy: groups the data by the specified key and returns a key-value (KV) RDD
* the key is the grouping field, the value is an Iterable holding every record of the group
* (all values of a group are collected together, so very large groups can take a lot of memory)
*
* groupBy has to bring records with the same key into the same partition, so it triggers a shuffle
* (a quick way to see the shuffle is shown after this demo)
*/
//group by class
val kvRDD: RDD[(String, Iterable[(String, Int)])] = clazzAndAge.groupBy(kv => kv._1)
//compute the average age of each class
val aveAgeRDD: RDD[(String, Double)] = kvRDD.map{
case (clazz:String, iter:Iterable[(String, Int)]) =>
val ages: Iterable[Int] = iter.map(kv => kv._2)
val avg_age: Double = ages.sum.toDouble / ages.size
(clazz,avg_age)
}
aveAgeRDD.foreach(println)
//keep the driver alive (e.g. to inspect the Spark web UI at http://localhost:4040)
while (true) {
}
}
}
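The shuffle mentioned in the comment can be observed directly from the RDD lineage. A minimal sketch, assuming it is added at the end of the main method above: printing the grouped RDD's debug string lists a ShuffledRDD stage in the plan.
//sketch: the lineage of the grouped RDD shows the shuffle (a ShuffledRDD appears in the printed plan)
println(kvRDD.toDebugString)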
groupByKey - groups by key automatically
package com.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo8GroupByKey {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("Demo8GroupByKey")
conf.setMaster("local")
val sc = new SparkContext(conf)
//read the student table data
val studentsRDD: RDD[String] = sc.textFile("data/students.txt")
val splitRDD: RDD[Array[String]] = studentsRDD.map(student => student.split(","))
//extract the class and the age
val clazzAndAgeRDD: RDD[(String, Int)] = splitRDD.map {
case Array(_, _, age: String, _, clazz: String) =>
(clazz, age.toInt)
}
/**
* groupByKey: groups the values of a key-value RDD by key
*
*/
val groupByKeyRDD: RDD[(String, Iterable[Int])] = clazzAndAgeRDD.groupByKey()
val avgAgeRDD: RDD[(String, Double)] = groupByKeyRDD.map {
case (clazz: String, ages: Iterable[Int]) =>
val avgAge: Double = ages.sum.toDouble / ages.size
(clazz, avgAge)
}
avgAgeRDD.foreach(println)
//keep the driver alive (e.g. to inspect the Spark web UI)
while (true) {
}
/**
* Differences between groupBy and groupByKey (see the sketch after this demo)
* 1. API: groupBy can be called on any RDD, groupByKey only works on kv-format RDDs
* 2. The RDD produced by groupByKey has a simpler structure (the values no longer carry the key)
* 3. Performance: groupByKey ships less data through the shuffle than groupBy, so it performs better
*
*/
}
}
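A minimal sketch of point 2 of the comparison, reusing clazzAndAgeRDD inside the main method of this demo: both operators build the same groups, but groupBy keeps the whole (clazz, age) record in the values, while groupByKey keeps only the ages, which is the simpler and smaller structure.
//groupBy on a kv RDD: the key is repeated inside every group
val byGroupBy: RDD[(String, Iterable[(String, Int)])] = clazzAndAgeRDD.groupBy(kv => kv._1)
//groupByKey: only the values are kept
val byGroupByKey: RDD[(String, Iterable[Int])] = clazzAndAgeRDD.groupByKey()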
reduceByKey - pre-aggregates on the map side (more efficient)
package com.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo9ReduceByKey {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("Demo9ReduceByKey")
conf.setMaster("local")
val sc = new SparkContext(conf)
val studentsRDD: RDD[String] = sc.textFile("data/students.txt")
val splitRDD: RDD[Array[String]] = studentsRDD.map(students => students.split(","))
//extract the class and map each student to (clazz, 1)
val clazzRDD: RDD[(String, Int)] = splitRDD.map {
case Array(_, _, _, _, clazz: String) =>
(clazz, 1)
}
/**
* reduceByKey: aggregates the values of each key with the given aggregation function
* reduceByKey also triggers a shuffle
*/
val countRDD: RDD[(String, Int)] = clazzRDD.reduceByKey((x:Int, y:Int) => x + y)
countRDD.foreach(println)
/**
* reduceByKey pre-aggregates on the map side, so the shuffle has to transfer less data and performance is better
* prefer reduceByKey over groupByKey whenever the aggregation allows it
* reduceByKey is less flexible than groupByKey:
* for example, a group's variance is easy to compute from groupByKey's full value list, but not with a single reduceByKey
*
*/
//the same counts computed with groupByKey: every (clazz, 1) record is shuffled, with no map-side combine
clazzRDD
.groupByKey()
.map(kv => (kv._1, kv._2.sum))
.foreach(println)
//keep the driver alive (e.g. to inspect the Spark web UI)
while (true) {
}
}
}
The difference between reduceByKey and groupByKey
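Both operators shuffle, but reduceByKey runs the aggregation function inside each map-side partition first, so at most one record per key per partition is sent over the network, while groupByKey ships every record and only aggregates after the grouping. A minimal sketch with hypothetical in-memory data (to be run inside any of the main methods above, it only needs the SparkContext sc):
//hypothetical data: six records spread over 2 partitions
val pairs: RDD[(String, Int)] = sc.parallelize(List(("a", 1), ("a", 1), ("b", 1), ("a", 1), ("b", 1), ("b", 1)), 2)
//reduceByKey: each partition combines its own records first, then the partial sums are shuffled
pairs.reduceByKey((x, y) => x + y).foreach(println)
//groupByKey: all six records are shuffled, the sum only happens after the grouping
pairs.groupByKey().map(kv => (kv._1, kv._2.sum)).foreach(println)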
union - merges two RDDs (their element types must match)
package com.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo10Union {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("Demo10Union")
conf.setMaster("local")
val sc = new SparkContext(conf)
val rdd1: RDD[String] = sc.textFile("data/words/1.txt")
val rdd2: RDD[String] = sc.textFile("data/words/2.txt")
/**
* union: merges two RDDs; the element types must match, and the result is not de-duplicated
* union only merges at the logical level, the data is not merged physically
*
* the number of partitions of the new RDD equals the sum of the partitions of the two input RDDs
* (a quick check of this is shown after the demo)
*/
val unionRDD: RDD[String] = rdd1.union(rdd2)
println(s"合并之后的分区数为,${unionRDD.getNumPartitions}")
unionRDD.foreach(println)
}
}
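The partition rule in the comment is easy to verify with two small in-memory RDDs (hypothetical data, a sketch to be run inside a main method that has a SparkContext named sc):
val a: RDD[Int] = sc.parallelize(1 to 4, numSlices = 2) //2 partitions
val b: RDD[Int] = sc.parallelize(5 to 8, numSlices = 3) //3 partitions
println(a.union(b).getNumPartitions) //prints 5 = 2 + 3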
join (inner join, left/right outer join, full outer join)
package com.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo11Join {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
conf.setAppName("Demo11Join")
conf.setMaster("local")
val sc = new SparkContext(conf)
//build test RDDs from local collections
val nameRDD: RDD[(String, String)] = sc.parallelize(
List(
("001", "张三"),
("002", "李四"),
("003", "王五"),
("004", "赵六"))
)
val ageRDD: RDD[(String, Int)] = sc.parallelize(
List(
("000", 22),
("001", 23),
("002", 24),
("003", 25)
))
/**
* inner join: only keys that exist on both sides appear in the result
*/
val innerJoinRDD: RDD[(String, (String, Int))] = nameRDD.join(ageRDD)
//reshape the result
innerJoinRDD
.map{
case(id:String,(name:String,age:Int)) =>
(id,name,age)
}
.foreach(println)
/**
* left join: keeps every record of the left RDD; if the right side has no match, the value is None
*
* Option has two cases: Some(value) when the key matched, None when it did not
*
*/
val leftJoinRDD: RDD[(String, (String, Option[Int]))] = nameRDD.leftOuterJoin(ageRDD)
//reshape the result
leftJoinRDD
.map{
//key matched on the right side
case(id:String, (name:String, Some(age)))=>
(id,name,age)
//no match on the right side
case(id:String,(name:String,None)) =>
(id,name,0)
}
.foreach(println)
/**
* full join: a key from either side produces a record; whatever the other side is missing is filled with None
*/
val fullJoinRDD: RDD[(String, (Option[String], Option[Int]))] = nameRDD.fullOuterJoin(ageRDD)
fullJoinRDD
.map{
case(id:String,(Some(name),Some(age))) =>
(id,name,age)
case(id:String,(Some(name),None)) =>
(id,name,0)
case(id:String,(None,Some(age))) =>
(id,"null",age)
}.foreach(println)
}
}
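The heading also mentions right joins; rightOuterJoin is the mirror of leftOuterJoin: it keeps every record of the right RDD and wraps the left side in an Option. A minimal sketch reusing nameRDD and ageRDD inside the main method above:
//right join: every record of ageRDD is kept; the name becomes an Option
val rightJoinRDD: RDD[(String, (Option[String], Int))] = nameRDD.rightOuterJoin(ageRDD)
rightJoinRDD
  .map {
    //matched on the left side
    case (id: String, (Some(name), age: Int)) =>
      (id, name, age)
    //no match on the left side
    case (id: String, (None, age: Int)) =>
      (id, "null", age)
  }
  .foreach(println)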
A small exercise
package com.core
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo12Student {
def main(args: Array[String]): Unit = {
/**
* Find the per-subject scores of the ten students with the highest total score in the grade
* 1. compute each student's total score
*/
val conf = new SparkConf()
conf.setAppName("Demo12Student")
conf.setMaster("local")
val sc = new SparkContext(conf)
//1. read the score data
val scoreRDD: RDD[(String, String, Double)] = sc.textFile("data/score.txt") //read the data
.map(sco => sco.split(",")) //split each line on commas
.filter(arr => arr.length == 3) //drop malformed rows
.map{
//extract the fields
case Array(sid:String,cid:String,sco:String) =>
(sid,cid,sco.toDouble)
}
//2. compute each student's total score
val sumScoRDD: RDD[(String, Double)] = scoreRDD
.map{
case (sid:String,_,sco:Double) =>
(sid,sco)
}
.reduceByKey((x:Double,y:Double) => x + y )
//3. rank by total score and take the top N
val top10_SumSco: Array[(String, Double)] = sumScoRDD
.sortBy(kv => -kv._2)
.take(10)
val top10_all: RDD[(String, String, Double)] = scoreRDD.filter{
case (id:String,_,_) =>
//check whether this student is one of the top ten
top10_SumSco.map(kv => kv._1).contains(id)
}
top10_all.foreach(println)
}
}
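One small refinement worth noting (a sketch, not part of the original exercise): the filter above rebuilds the list of top-10 ids and scans it linearly for every score record; collecting the ids into a Set once makes the membership check constant-time.
//build the lookup set once on the driver, then reuse it in the filter closure
val top10Ids: Set[String] = top10_SumSco.map(kv => kv._1).toSet
val top10_all2: RDD[(String, String, Double)] = scoreRDD.filter {
  case (id: String, _, _) =>
    top10Ids.contains(id)
}
top10_all2.foreach(println)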