package com.lyzx.day35
import org.apache.spark.{SparkConf, SparkContext}
class T2 {
  // Compute (sum, count) over the list with a parallel collection's aggregate.
  def f1(): Unit = {
    val data = List(2, 5, 8, 1, 2, 6, 9, 4, 3, 5)
    val res = data.par.aggregate((0, 0))(
      // seqOp: fold one element into a partial (sum, count)
      (acc, number) => (acc._1 + number, acc._2 + 1),
      // combOp: merge two partial (sum, count) pairs
      (par1, par2) => (par1._1 + par2._1, par1._2 + par2._2)
    )
    println(res) // (45,10)
  }
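
  // A hedged follow-up sketch (the helper name f1Mean is ours, not from the
  // original): the (sum, count) pair that aggregate returns makes deriving
  // the mean a one-liner.
  def f1Mean(): Unit = {
    val data = List(2, 5, 8, 1, 2, 6, 9, 4, 3, 5)
    val (sum, count) = data.par.aggregate((0, 0))(
      (acc, n) => (acc._1 + n, acc._2 + 1),
      (p1, p2) => (p1._1 + p2._1, p1._2 + p2._2)
    )
    println("mean = " + sum.toDouble / count) // 45 / 10 = 4.5
  }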
  // RDD.aggregate with a String zero value: watch how "oo" is reused.
  def f2(sc: SparkContext): Unit = {
    val rdd = sc.parallelize(List("aa", "bb", "cc"))
    // seqOp: (U, T) => U, folds each element into the partition's partial result
    val seqOp = (a: String, b: String) => {
      println("a=" + a + " b=" + b)
      a + "-" + b
    }
    // combOp: (U, U) => U, merges the per-partition partial results
    val combOp = (a: String, b: String) => {
      println("a:" + a + " b:" + b)
      a + ":" + b
    }
    val res = rdd.aggregate("oo")(seqOp, combOp)
    println("res:" + res)
  }
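
  // A hedged variant (the helper name f2Partitioned is ours): with an explicit
  // partition count, the zero value "oo" seeds the seqOp fold in EVERY
  // partition and then seeds the combOp fold once more, so it shows up
  // repeatedly in the result, e.g. "oo:oo-aa:oo-bb:oo-cc" for 3 partitions
  // (the combine order of the partition results may vary).
  def f2Partitioned(sc: SparkContext): Unit = {
    val res = sc.parallelize(List("aa", "bb", "cc"), 3)
      .aggregate("oo")(_ + "-" + _, _ + ":" + _)
    println(res)
  }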
  /*
    A detailed walkthrough of the aggregate function
  */
  def f3(sc: SparkContext): Unit = {
    val rdd = sc.parallelize(1 to 10, 2)
    // Peek at the partition layout. Note: the original drained the iterator
    // in a while loop and then yielded from the already-empty iterator;
    // mapping instead prints each element while keeping it in the output.
    rdd.mapPartitionsWithIndex((idx, itr) =>
      itr.map { v =>
        println("[" + idx + "] ::: " + v)
        v
      }
    ).collect()
    // seqOp performs the local, per-partition aggregation (the analogue of
    // map-side combining). Here it sums each partition's elements: the data
    // sits on two partitions, p1 [1,2,3,4,5] and p2 [6,7,8,9,10], so the
    // local results are 15 for p1 and 40 for p2.
    val x = (a: Int, b: Int) => {
      println("1>x=" + a + ",y=" + b)
      a + b
    }
    // combOp then aggregates those local results [15, 40] again; for plain
    // aggregate this merge happens on the driver, not in a shuffle.
    val y = (a: Int, b: Int) => {
      println("2>x=" + a + ",y=" + b)
      a + b
    }
    val r = rdd.aggregate(0)(x, y)
    println("RDD:" + r)
  }
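
  // For reference, the RDD.aggregate signature this section exercises
  // (from the Spark core API):
  //   def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U,
  //                                            combOp: (U, U) => U): U
  // zeroValue seeds the seqOp fold in each partition; the per-partition
  // results are then merged with combOp on the driver.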
  /**
   * aggregateByKey
   * Useful when the map-side (pre-shuffle) and reduce-side logic differ.
   * @param sc the SparkContext
   */
  def f4(sc: SparkContext): Unit = {
    val data = List((1, 3), (1, 2), (1, 4), (2, 3), (3, 6), (3, 8))
    val rdd = sc.parallelize(data, 2)
    // Peek at the partition layout (same iterator fix as in f3).
    rdd.mapPartitionsWithIndex((index, itr) =>
      itr.map { v =>
        println("[" + index + "] " + v)
        v
      }
    ).collect()
    // Map-side local aggregation: applied to values sharing a key within a
    // partition, keeping the per-key maximum, e.g. (1,(3,2,4)) => (1,4).
    val x = (a: Int, b: Int) => { println("1>x=" + a + ",y=" + b); Math.max(a, b) }
    // Reduce-side combine. With this 2-way split, every key lives in a single
    // partition ((1,*) in one, (2,*) and (3,*) in the other), so there are no
    // cross-partition partial results to merge and this function never runs.
    val y = (a: Int, b: Int) => { println("2>x=" + a + ",y=" + b); a + b }
    rdd.aggregateByKey(0)(x, y)
      .collect()
      .foreach(println)
  }
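
  // A hedged counterpart to f4 (the helper name f4Spanning is ours): with 3
  // partitions the six pairs split as [(1,3),(1,2)] [(1,4),(2,3)]
  // [(3,6),(3,8)], so key 1 now spans two partitions and the combOp really
  // does run: max(3,2)=3 from one partition and max(4)=4 from another are
  // summed into (1,7).
  def f4Spanning(sc: SparkContext): Unit = {
    val rdd = sc.parallelize(List((1, 3), (1, 2), (1, 4), (2, 3), (3, 6), (3, 8)), 3)
    rdd.aggregateByKey(0)((a, b) => Math.max(a, b), (a, b) => a + b)
      .collect()
      .foreach(println) // expect (1,7), (2,3), (3,8), order may vary
  }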
}
object T2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("day35").setMaster("local")
    val sc = new SparkContext(conf)
    val t = new T2
    // t.f3(sc)
    t.f4(sc)
    sc.stop()
  }
}
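
// For orientation, running t.f4(sc) above should print only "1>" lines (the
// combOp never fires with the 2-way split) followed by the per-key maxima,
// e.g. (2,3), (1,4), (3,8); result order may vary with the shuffle's hash
// partitioning.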
《深入理解Spark》 (Understanding Spark in Depth): aggregate and aggregateByKey