/**
 * Action operators on key-value (pair) RDDs
 */
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object Demo {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("action").setMaster("local")
    val sc: SparkContext = new SparkContext(sparkConf)
    countOper(sc)
    saveAsFileOper(sc)
    reduceByKeyOper(sc)
    sc.stop()
  }
  def countOper(sc: SparkContext): Unit = {
    val rdd: RDD[(String, Int)] = sc.makeRDD(Array(("cy", 32), ("hr", 23), ("hr", 24), ("mm", 46), ("bb", 51), ("cy", 33)))
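    // count is an action: it returns the total number of elements in the RDD (6 for the sample data above)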
    val count: Long = rdd.count()
    println(count)
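    // countByKey is an action: it returns a local Map of key -> number of elements with that key
    // (here Map(cy -> 2, hr -> 2, mm -> 1, bb -> 1))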
    val data: scala.collection.Map[String, Long] = rdd.countByKey()
    println(data)
  }
  def saveAsFileOper(sc: SparkContext): Unit = {
    val rdd: RDD[(String, Int)] = sc.makeRDD(Array(("cy", 32), ("hr", 23), ("hr", 24), ("mm", 46), ("bb", 51), ("cy", 33)))
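    // saveAsTextFile writes each pair's toString as a line of plain text;
    // saveAsSequenceFile writes the pairs to a Hadoop SequenceFile (keys/values converted to Writables).
    // Note: both actions fail if the target directory already exists.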
    rdd.saveAsTextFile("hdfs://node1:9000/spark-study/a")
    rdd.saveAsSequenceFile("hdfs://node1:9000/spark-study/b")
  }
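  /**
   * A minimal sketch (not wired into main) of reading the SequenceFile written above back into a
   * pair RDD; it assumes the hdfs://node1:9000/spark-study/b directory produced by saveAsFileOper
   * already exists.
   */
  def readSequenceFileOper(sc: SparkContext): Unit = {
    // sequenceFile converts the Hadoop Writable keys/values back to String and Int
    val rdd: RDD[(String, Int)] = sc.sequenceFile[String, Int]("hdfs://node1:9000/spark-study/b")
    rdd.foreach(println(_))
  }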
  /**
   * reduceByKey -- strictly a transformation, not an action: it aggregates the values of all
   * elements that share the same key with the given func (sum, max, ...). The actions
   * demonstrated in this method are collect and foreach.
   * reduceByKey runs a map-side combiner before the shuffle, which greatly reduces the amount of
   * data transferred in the shuffle stage (see the groupByKey comparison sketch after this method).
   * One of its overloads: def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)]
   */
  def reduceByKeyOper(sc: SparkContext): Unit = {
    // rdd holds the input file line by line
    val rdd: RDD[String] = sc.textFile("hdfs://node1:9000/wc.txt")
    // rdd1 holds the individual words
    val rdd1: RDD[String] = rdd.flatMap((line: String) => {
      line.split(" ")
    })
    // rdd2 holds key-value pairs of (word, 1)
    val rdd2: RDD[(String, Int)] = rdd1.map((word: String) => {
      (word, 1)
    })
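    // collect is an action: it pulls the whole RDD back to the driver as a local Array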
    println(rdd2.collect().mkString("; "))
    // aggregate the values of identical keys
    val rdd3: RDD[(String, Int)] = rdd2.reduceByKey((a: Int, b: Int) => {
      a + b
    })
    rdd3.foreach(println(_))
  }
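  /**
   * A minimal sketch (not wired into main) contrasting reduceByKey with groupByKey on the same
   * word-count input, to illustrate the map-side combiner mentioned above: groupByKey shuffles
   * every (word, 1) pair before summing, while reduceByKey pre-aggregates within each partition,
   * so far less data crosses the shuffle boundary. The wc.txt path is reused from reduceByKeyOper.
   */
  def groupByKeyComparisonOper(sc: SparkContext): Unit = {
    val pairs: RDD[(String, Int)] = sc.textFile("hdfs://node1:9000/wc.txt")
      .flatMap((line: String) => line.split(" "))
      .map((word: String) => (word, 1))
    // groupByKey: all values for a key are shuffled first, then summed on the reduce side
    val viaGroupByKey: RDD[(String, Int)] = pairs.groupByKey().mapValues(_.sum)
    // reduceByKey: values are partially summed before the shuffle (map-side combiner)
    val viaReduceByKey: RDD[(String, Int)] = pairs.reduceByKey(_ + _)
    viaGroupByKey.foreach(println(_))
    viaReduceByKey.foreach(println(_))
  }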
}