Method 1: groupBy
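All eleven variants below use the same input, an RDD of (word, count) pairs spread over 2 partitions, and each prints (a,9) and (b,12) (the order may differ between variants). groupBy is the most literal approach: it groups the whole tuples by key, shuffling every record, and only then sums the counts of each group.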
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_01_groupBy {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Group the tuples by key, then sum the counts within each group
    rdd.groupBy(_._1).mapValues(_.map(_._2).sum).collect().foreach(println)
    sc.stop()
  }
}
Method 2: groupByKey
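groupByKey shuffles only the values for each key rather than the whole tuples, but like groupBy it does no map-side combining, so every count still crosses the network before being summed.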
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_02_groupByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Group the values by key, then sum each group
    rdd.groupByKey().mapValues(_.sum).collect().foreach(println)
    sc.stop()
  }
}
Method 3: reduceByKey
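reduceByKey merges values inside each partition before the shuffle (map-side combine), so only one partial sum per key and partition is transferred; of the shuffle-based variants this is usually the preferred one.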
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_03_reduceByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Merge the counts per key; combining already starts within each partition
    rdd.reduceByKey(_ + _).collect().foreach(println)
    sc.stop()
  }
}
Method 4: aggregateByKey
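aggregateByKey takes a zero value plus two functions, one for merging within a partition and one for merging across partitions. With 0 and addition for both, it behaves like the reduceByKey version above.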
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_04_aggregateByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Zero value 0; the first function merges within a partition, the second across partitions
    rdd.aggregateByKey(0)(_ + _, _ + _).collect().foreach(println)
    sc.stop()
  }
}
Method 5: foldByKey
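foldByKey is the shorthand for the case where the intra- and inter-partition functions of aggregateByKey are identical: only the zero value and a single function remain.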
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_05_foldByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // One function serves both the intra- and inter-partition merge
    rdd.foldByKey(0)(_ + _).collect().foreach(println)
    sc.stop()
  }
}
Method 6: combineByKey
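combineByKey is the most general of the *ByKey aggregations. It takes three functions: create an initial combiner from the first value seen for a key, merge a further value into a combiner within a partition, and merge two combiners across partitions.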
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_06_combineByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    rdd.combineByKey(
      (v: Int) => v,                  // createCombiner: the first count of a key starts the sum
      (sum: Int, v: Int) => sum + v,  // mergeValue: fold further counts in within a partition
      (s1: Int, s2: Int) => s1 + s2   // mergeCombiners: merge the per-partition sums
    ).collect().foreach(println)
    sc.stop()
  }
}
Method 7: countByKey
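countByKey is an action that counts how many records exist per key and returns the result as a local map on the driver. The counts therefore have to be turned into repetition first: each (word, count) pair is expanded into the word repeated count times, re-paired with 1, and then counted.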
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_07_countByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Expand each (word, count) pair into the word repeated count times,
    // e.g. ("a", 2) becomes "a a"; then count the occurrences of each key
    rdd.map(data => (data._1 + " ") * data._2)
      .flatMap(_.split(" "))
      .map((_, 1))
      .countByKey()
      .foreach(println)
    sc.stop()
  }
}
Method 8: countByValue
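countByValue counts how often each element occurs in the RDD, so after the same expansion into repeated words no re-pairing with 1 is needed.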
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_08_countByValue {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Expand the pairs into repeated words as before; countByValue counts each distinct word directly
    rdd.map(data => (data._1 + " ") * data._2)
      .flatMap(_.split(" "))
      .countByValue()
      .foreach(println)
    sc.stop()
  }
}
Method 9: aggregate
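aggregate is an action over the whole RDD rather than a per-key transformation. Each pair is wrapped in a single-entry mutable.Map, and the maps are merged by folding one into the other. Since the zero value and the element type are both maps here, the same merge function can serve the intra- and inter-partition step.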
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object WordCount_09_aggregate {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Merge the entries of map1 into map2, summing the counts of duplicate keys
    val merge = (map1: mutable.Map[String, Int], map2: mutable.Map[String, Int]) =>
      map1.foldLeft(map2) { (acc, kv) =>
        acc(kv._1) = acc.getOrElse(kv._1, 0) + kv._2
        acc
      }
    // Wrap each pair in a single-entry map, then merge all maps into one
    rdd.map(data => mutable.Map(data._1 -> data._2))
      .aggregate(mutable.Map[String, Int]())(merge, merge)
      .foreach(println)
    sc.stop()
  }
}
Method 10: fold
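fold is aggregate restricted to a single function for both merge phases, so the map-merge function is passed only once.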
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object WordCount_10_fold {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Merge the entries of map1 into map2, summing the counts of duplicate keys
    val merge = (map1: mutable.Map[String, Int], map2: mutable.Map[String, Int]) =>
      map1.foldLeft(map2) { (acc, kv) =>
        acc(kv._1) = acc.getOrElse(kv._1, 0) + kv._2
        acc
      }
    // fold needs only the zero value and the single merge function
    rdd.map(data => mutable.Map(data._1 -> data._2))
      .fold(mutable.Map[String, Int]())(merge)
      .foreach(println)
    sc.stop()
  }
}
Method 11: reduce + foldLeft
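reduce drops the zero value as well: the single-entry maps are merged pairwise with the same foldLeft-based merge, folding the first map's entries into the second.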
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object WordCount_11_reduce_foldleft {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the data: (word, count) pairs in 2 partitions
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
    // Merge the single-entry maps pairwise; no zero value is needed
    rdd.map(data => mutable.Map(data._1 -> data._2))
      .reduce { (map1, map2) =>
        map1.foldLeft(map2) { (acc, kv) =>
          acc(kv._1) = acc.getOrElse(kv._1, 0) + kv._2
          acc
        }
      }
      .foreach(println)
    sc.stop()
  }
}