spark outline
Spark reduceByKey 功能
按照相同的key,对value进行聚合
案例演示
有List(("a", 1), ("b", 3), ("a", 5), ("b", 2))
("b", 3):表示b这个单词出现3次
需求:统计各个单词出现总次数,并输出到控制台(a,6)
(b,5)
package com.xcu.bigdata.spark.core.pg02_rdd.pg022_rdd_transform
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @Desc : Aggregate the values of identical keys (word-count demo for reduceByKey).
 */
object Spark13_ReduceByKey {
  def main(args: Array[String]): Unit = {
    // Job configuration; local[*] runs with all available cores.
    val conf: SparkConf = new SparkConf()
      .setAppName("Spark13_ReduceByKey")
      .setMaster("local[*]")
    // SparkContext is the entry point for submitting the job.
    val sc = new SparkContext(conf)

    // Source data: (word, count) pairs.
    val pairs: RDD[(String, Int)] =
      sc.makeRDD(List(("a", 1), ("b", 3), ("a", 5), ("b", 2)))

    // Sum the counts of equal keys; `_ + _` is the associative combiner.
    val totals: RDD[(String, Int)] = pairs.reduceByKey(_ + _)

    // Collect results to the driver and print each (word, total) pair.
    totals.collect().foreach(println)

    // Release cluster resources.
    sc.stop()
  }
}
小练习:用reduceByKey求平均成绩
package com.xcu.bigdata.spark.core.pg02_rdd.pg022_rdd_transform
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @Desc : Compute each student's average score with reduceByKey.
 */
object Spark13_ReduceByKeyPractice {
  def main(args: Array[String]): Unit = {
    // Job configuration; run locally on all cores.
    val conf: SparkConf = new SparkConf()
      .setAppName("Spark13_ReduceByKeyPractice")
      .setMaster("local[*]")
    // Entry point for submitting the job.
    val sc = new SparkContext(conf)

    // (student, score) records.
    val scores: RDD[(String, Int)] = sc.makeRDD(
      List(("zs", 90), ("lisi", 60), ("zs", 96), ("lisi", 62), ("zs", 100), ("lisi", 50))
    )

    // Pair every score with a count of 1 so sum and count reduce together.
    val withCounts: RDD[(String, (Int, Int))] =
      scores.map { case (name, score) => (name, (score, 1)) }

    // Per student: add the score sums and add the counts.
    val summed: RDD[(String, (Int, Int))] = withCounts.reduceByKey(
      (a, b) => (a._1 + b._1, a._2 + b._2)
    )

    // Integer division reproduces the truncated averages, e.g. (zs,95), (lisi,57).
    val averages: RDD[(String, Int)] =
      summed.mapValues { case (sum, cnt) => sum / cnt }

    // Collect to the driver and print.
    averages.collect().foreach(println)

    // Release resources.
    sc.stop()
  }
}
结果:
(zs,95)
(lisi,57)