Several ways to implement word count with Spark Core (based on Spark 3.1.2 and Scala 2.12.13):
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[4]").setAppName("WordCount")
    val sc = new SparkContext(conf)

    // Base RDD of (word, 1) pairs shared by all of the variants below
    val rdd = sc.makeRDD(Seq(
        "Apache Spark lightning-fast unified analytics engine",
        "Apache Spark is a unified analytics engine for large-scale data processing"))
      .flatMap(_.split(" "))
      .map((_, 1))
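
    // 1. reduceByKey: merges the values of each key with the given function and
    //    combines map-side before the shuffle, so it is the usual first choice.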
println("==========1. reduceByKey==========")
val words = rdd.reduceByKey(_ + _)
words.collect().foreach(println)
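
    // 2. groupByKey: shuffles every (word, 1) pair and sums the grouped values on the
    //    reduce side; there is no map-side combine, so it is less efficient than reduceByKey.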
    println("==========2. groupByKey==========")
    val groupRDD = rdd.groupByKey().mapValues(_.sum)
    groupRDD.collect().foreach(println)
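
    // 3. aggregateByKey(zeroValue)(seqOp, combOp): seqOp folds values within a partition,
    //    combOp merges the per-partition results. With math.max as seqOp each partition
    //    contributes at most 1 per word, so the final sum only equals the true count because
    //    no word repeats within a partition in this sample data.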
    println("==========3. aggregateByKey1==========")
    val aggRDD = rdd.aggregateByKey(0)(math.max, _ + _)
    aggRDD.collect().foreach(println)
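
    // 4. aggregateByKey with the same function for seqOp and combOp: a plain sum on both
    //    sides, which always yields the correct word count.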
println("==========4. aggregateByKey2==========")
val aggRDD2 = rdd.aggregateByKey(0)(_ + _, _ + _)
aggRDD2.collect().foreach(println)
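
    // 5. foldByKey: reduceByKey with an initial zero value; equivalent to
    //    aggregateByKey(0)(_ + _, _ + _) when seqOp and combOp are the same.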
    println("==========5. foldByKey==========")
    val foldRDD = rdd.foldByKey(0)(_ + _)
    foldRDD.collect().foreach(println)
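
    // 6. combineByKey(createCombiner, mergeValue, mergeCombiners): the most general of the
    //    *ByKey aggregations. Here it builds a (sum, count) pair per word and then keeps the
    //    count; since every value is 1, sum and count are identical.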
    println("==========6. combineByKey==========")
    val combineRDD = rdd.combineByKey(
      (_, 1),                                             // createCombiner: first value of a key -> (value, 1)
      (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1),   // mergeValue: fold another value into the pair
      (acc1: (Int, Int), acc2: (Int, Int)) => (acc1._1 + acc2._1, acc1._2 + acc2._2) // mergeCombiners
    ).mapPartitions(prt => prt.map(f => (f._1, f._2._2)))
    combineRDD.collect().foreach(println)
    sc.stop()
  }
}
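
To compile and run this locally with sbt, a minimal build.sbt along these lines should work (a sketch only; the project name is arbitrary):

// build.sbt (illustrative sketch)
name := "word-count"
scalaVersion := "2.12.13"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.1.2"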