package com.shujia.core
import com.shujia.core.Demo10Join.Student
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Demonstrates the `mapValues` transformation on a key-value RDD.
 *
 * Reads `Spark/data/students.txt`, turns every comma-separated line into an
 * `(id, name)` pair taken from the first two fields, and then walks over the
 * values with `mapValues`, printing each name and each resulting pair.
 */
object Demo014MapValues {

  /**
   * Entry point of the demo.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Create the Spark context, running in local mode.
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo014MapValues")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    try {
      // Read the student data and keep only the (id, name) pair of each row.
      // Only the first two fields are touched, so the remaining columns are
      // neither parsed nor required to be well-formed.
      val stuRDD: RDD[(String, String)] = sc
        .textFile("Spark/data/students.txt")
        .map(line => {
          val splits: Array[String] = line.split(",")
          (splits(0), splits(1))
        })

      /**
       * mapValues is a transformation that works on key-value RDDs: the
       * function is applied to every value while the key is left untouched.
       *
       * The function's result becomes the new value, so the name has to be
       * returned after printing it; ending the lambda with println would
       * replace every value with Unit and the job would print (id,()).
       */
      stuRDD
        .mapValues(name => {
          println(name)
          name
        })
        .foreach(println)
    } finally {
      // Release the context's resources even when the job fails.
      sc.stop()
    }
  }
}
package com.shujia.core
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Demonstrates `aggregateByKey` next to `reduceByKey`.
 *
 * Using `Spark/data/students.txt` (comma-separated; the code reads field 2 as
 * age, field 3 as gender and field 4 as class) the demo prints:
 *  1. the number of students per gender computed with `reduceByKey`,
 *  2. the same counts computed with `aggregateByKey`,
 *  3. the average age per class computed with `aggregateByKey`.
 */
object Demo15AggregateByKey {

  /**
   * Entry point of the demo.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Create the Spark context, running in local mode.
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo15AggregateByKey")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    try {
      // Read the student data once and split every line into its fields;
      // both pair RDDs below are derived from this single definition.
      val fieldsRDD: RDD[Array[String]] = sc
        .textFile("Spark/data/students.txt")
        .map(line => line.split(","))

      // (gender, 1) for every student.
      val genderRDD: RDD[(String, Int)] = fieldsRDD.map(fields => (fields(3), 1))

      // Count students per gender.
      // reduceByKey is limited: its result has the same type as the values,
      // so it cannot directly produce something like an average.
      genderRDD
        .reduceByKey(_ + _)
        .foreach(println)

      /**
       * aggregateByKey is a transformation taking:
       *   zeroValue - the initial value of the accumulator
       *   seqOp     - folds one value into the accumulator inside a partition
       *               (map-side aggregation)
       *   combOp    - merges the accumulators of different partitions
       *               (reduce-side aggregation)
       * Several aggregations can be carried in one accumulator, which is what
       * makes an average possible where reduceByKey alone is not enough.
       */
      genderRDD
        .aggregateByKey(0)(
          (acc: Int, one: Int) => acc + one,        // within a partition
          (left: Int, right: Int) => left + right   // across partitions
        )
        .foreach(println)

      // Average age per class; this cannot be expressed directly with reduceByKey.
      val clazzAgeRDD: RDD[(String, Int)] =
        fieldsRDD.map(fields => (fields(4), fields(2).toInt))

      clazzAgeRDD
        // The accumulator is (sum of ages, number of students).
        .aggregateByKey((0, 0))(
          (acc: (Int, Int), age: Int) => (acc._1 + age, acc._2 + 1),
          (left: (Int, Int), right: (Int, Int)) =>
            (left._1 + right._1, left._2 + right._2)
        )
        .map {
          // The count is at least 1 for every key that exists, so the
          // division is safe; toDouble keeps the fractional part.
          case (clazz, (sumAge, sumCnt)) =>
            s"$clazz,${sumAge / sumCnt.toDouble}"
        }
        .foreach(println)
    } finally {
      // Release the context's resources even when a job fails.
      sc.stop()
    }
  }
}
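/*
Console output that appears to come from a run of Demo15AggregateByKey
(gender counts from reduceByKey, the same counts from aggregateByKey,
then the average age per class):

(男,507)
(女,493)
(男,507)
(女,493)
理科二班,22.556962025316455
文科三班,22.680851063829788
理科四班,22.63736263736264
理科一班,22.333333333333332
文科五班,22.30952380952381
文科一班,22.416666666666668
文科四班,22.506172839506174
理科六班,22.48913043478261
理科三班,22.676470588235293
文科六班,22.60576923076923
理科五班,22.642857142857142
文科二班,22.379310344827587
*/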