1. aggregateByKey

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object wordCount1aggregateByKey {
  def main(args: Array[String]): Unit = {
    // aggregateByKey(zeroValue)(seqOp, combOp): seqOp merges values into the
    // accumulator within a partition, combOp merges accumulators across partitions
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    val aggregateByKeyRDD: RDD[(String, Int)] = rdd.aggregateByKey(0)(_ + _, _ + _)
    println(aggregateByKeyRDD.collect().mkString(","))
  }
}
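
With identical functions, aggregateByKey(0)(_ + _, _ + _) behaves exactly like reduceByKey(_ + _). The two parameters matter when the intra-partition and cross-partition logic differ; a minimal sketch, reusing rdd from the listing above (the per-partition values assume makeRDD's even positional split across the two partitions):

    val maxThenSum: RDD[(String, Int)] = rdd.aggregateByKey(0)(math.max, _ + _)
    // partition 0 holds ("a",1),("b",2),("a",1); partition 1 holds ("b",2),("a",1),("b",2),
    // so the per-partition maxima sum to (a,2),(b,4)
    println(maxThenSum.collect().mkString(","))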
2. combineByKey

package core.tc.spark.wordcount

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object wordCount2combineByKey {
  def main(args: Array[String]): Unit = {
    // combineByKey: like aggregateByKey, but the first value seen for a key is
    // converted into the initial accumulator instead of starting from a fixed zero
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    val result: RDD[(String, Int)] = rdd.combineByKey(
      v => v,                        // createCombiner: the first value becomes the accumulator
      (t: Int, v) => t + v,          // mergeValue: fold further values in, within a partition
      (t1: Int, t2: Int) => t1 + t2  // mergeCombiners: merge accumulators across partitions
    )
    println(result.collect().mkString(","))
  }
}
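
combineByKey pays off when the accumulator type differs from the value type, which neither foldByKey nor reduceByKey allows. A sketch of the classic per-key average, reusing rdd from above (the names are mine):

    val avg: RDD[(String, Double)] = rdd.combineByKey(
      v => (v, 1),                                                  // first value -> (sum, count)
      (acc: (Int, Int), v) => (acc._1 + v, acc._2 + 1),             // fold a value in, within a partition
      (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)  // merge partial (sum, count) pairs
    ).mapValues { case (sum, cnt) => sum.toDouble / cnt }
    println(avg.collect().mkString(","))  // (a,1.0),(b,2.0)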
3. foldByKey

package core.tc.spark.wordcount

import org.apache.spark.{SparkConf, SparkContext}

object wordCount3foldByKey {
  def main(args: Array[String]): Unit = {
    // foldByKey: aggregateByKey with one function used both within and across partitions
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    val result = rdd.foldByKey(0)(_ + _)
    println(result.collect().mkString(","))
  }
}
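
One caveat worth knowing: the zero value is applied once per key per partition, not once per key. A sketch with a non-zero initial value, reusing rdd from above:

    val folded = rdd.foldByKey(10)(_ + _)
    // both keys appear in both partitions here, so each total is inflated by 2 * 10:
    println(folded.collect().mkString(","))  // (a,23),(b,26) instead of (a,3),(b,6)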
4. groupByKey

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object wordCount4groupByKey {
  def main(args: Array[String]): Unit = {
    // groupByKey: shuffle all values of each key into one Iterable, then sum it
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    val groupByKeyRDD: RDD[(String, Iterable[Int])] = rdd.groupByKey()
    val result: RDD[(String, Int)] = groupByKeyRDD.map {
      case (key, values) => (key, values.sum)
    }
    println(result.collect().mkString(","))
  }
}
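
The same thing reads a little tighter with mapValues, which leaves the key untouched (a sketch reusing rdd from above). Keep in mind that groupByKey shuffles every value across the network; when the follow-up is a reduction like this, reduceByKey is the cheaper choice because it combines map-side first.

    val viaMapValues: RDD[(String, Int)] = rdd.groupByKey().mapValues(_.sum)
    println(viaMapValues.collect().mkString(","))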
5. groupBy

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object wordCount5groupBy {
  def main(args: Array[String]): Unit = {
    // groupBy(_._1): unlike groupByKey, the groups keep the whole tuple,
    // so the counts must be projected out with map(_._2) before summing
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    val groupByRDD: RDD[(String, Iterable[(String, Int)])] = rdd.groupBy(_._1)
    val result = groupByRDD.mapValues(data => data.map(_._2).sum)
    println(result.collect().mkString(","))
  }
}
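
Because groupBy takes an arbitrary key function, it also works on a non-pair RDD. A sketch counting raw words directly (a fresh words RDD, name mine, same sc as above):

    val words = sc.makeRDD(List("a", "b", "a", "b", "a", "b"))
    val counts = words.groupBy(identity).mapValues(_.size)
    println(counts.collect().mkString(","))  // (a,3),(b,3)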
6. reduceByKey

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object wordCount6reduceByKey {
  def main(args: Array[String]): Unit = {
    // reduceByKey: merge the values of each key, combining map-side before the shuffle
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    val result: RDD[(String, Int)] = rdd.reduceByKey(_ + _)
    println(result.collect().mkString(","))
  }
}
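
reduceByKey also has an overload that sets the number of result partitions; a small sketch reusing rdd from above:

    val singlePartition: RDD[(String, Int)] = rdd.reduceByKey(_ + _, 1)
    println(singlePartition.getNumPartitions)  // 1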
7. aggregate

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable

object wordCount7aggregate {
  def main(args: Array[String]): Unit = {
    // aggregate (an action): reduce the whole RDD to one value on the driver,
    // here a mutable.Map holding the word counts
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    // expand each pair into repeated words, e.g. ("b", 2) -> "b b "
    val strRDD: RDD[String] = rdd.map {
      case (str, cnt) => (str + " ") * cnt
    }
    val flatMapRDD = strRDD.flatMap(_.split(" "))
    // wrap each word in a single-entry Map, then merge maps within partitions (seqOp)
    // and across partitions (combOp); both merges happen to be the same here
    flatMapRDD.map(s => mutable.Map(s -> 1)).aggregate(mutable.Map[String, Int]())(
      (map1: mutable.Map[String, Int], map2: mutable.Map[String, Int]) =>
        map1.foldLeft(map2) { (innerMap, kv) =>
          innerMap(kv._1) = innerMap.getOrElse(kv._1, 0) + kv._2
          innerMap
        },
      (map1: mutable.Map[String, Int], map2: mutable.Map[String, Int]) =>
        map1.foldLeft(map2) { (innerMap, kv) =>
          innerMap(kv._1) = innerMap.getOrElse(kv._1, 0) + kv._2
          innerMap
        }
    ).foreach(println)
  }
}
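
Since the two merge functions are identical, they can be factored into one named helper (a sketch, reusing flatMapRDD from the listing above; aggregate still takes the function twice because in general the intra-partition and cross-partition merges may differ):

    def merge(m1: mutable.Map[String, Int], m2: mutable.Map[String, Int]): mutable.Map[String, Int] =
      m1.foldLeft(m2) { (acc, kv) =>
        acc(kv._1) = acc.getOrElse(kv._1, 0) + kv._2
        acc
      }
    val wordCounts = flatMapRDD.map(s => mutable.Map(s -> 1)).aggregate(mutable.Map[String, Int]())(merge, merge)
    println(wordCounts)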
8. fold

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable

object wordCount8fold {
  def main(args: Array[String]): Unit = {
    // fold (an action): like aggregate, but with a single merge function, so the
    // element type must match the accumulator type -- hence the single-entry Maps
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    // expand each pair into repeated words, e.g. ("b", 2) -> "b b "
    val strRDD: RDD[String] = rdd.map {
      case (str, cnt) => (str + " ") * cnt
    }
    val flatMapRDD = strRDD.flatMap(_.split(" "))
    flatMapRDD.map(s => mutable.Map(s -> 1)).fold(mutable.Map[String, Int]())(
      (map1: mutable.Map[String, Int], map2: mutable.Map[String, Int]) =>
        map1.foldLeft(map2) { (innerMap, kv) =>
          innerMap(kv._1) = innerMap.getOrElse(kv._1, 0) + kv._2
          innerMap
        }
    ).foreach(println)
  }
}
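
The per-word Maps exist only to satisfy fold's single-type constraint. With aggregate the elements can stay plain Strings, since the accumulator type may differ from the element type; a sketch reusing flatMapRDD from above:

    val counts = flatMapRDD.aggregate(mutable.Map[String, Int]())(
      (acc, word) => { acc(word) = acc.getOrElse(word, 0) + 1; acc },  // fold one word into the map
      (m1, m2) => m2.foldLeft(m1) { (acc, kv) =>                       // merge per-partition maps
        acc(kv._1) = acc.getOrElse(kv._1, 0) + kv._2
        acc
      }
    )
    println(counts)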
9. countByKey

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object wordCount9countByKey {
  def main(args: Array[String]): Unit = {
    // countByKey (an action): count how many pairs exist for each key and
    // return the result as a Map on the driver
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    // expand each pair into repeated words, e.g. ("b", 2) -> "b b "
    val strRDD: RDD[String] = rdd.map {
      case (str, cnt) => (str + " ") * cnt
    }
    val flatMapRDD = strRDD.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatMapRDD.map((_, 1))
    val result: collection.Map[String, Long] = mapRDD.countByKey()
    println(result)
  }
}
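
countByKey collects the entire result map to the driver, which is only safe when the set of distinct keys is small. The distributed equivalent (a sketch reusing mapRDD from above) keeps the counts in an RDD instead:

    val distributed: RDD[(String, Long)] = mapRDD.mapValues(_.toLong).reduceByKey(_ + _)
    println(distributed.collect().mkString(","))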
10. countByValue

package core.tc.spark.wordcount

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object wordCount10countByValue {
  def main(args: Array[String]): Unit = {
    // countByValue (an action): count how many times each element occurs,
    // so the (word, 1) mapping step is not needed at all
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("File - RDD")
    val sc = new SparkContext(sparkConf)
    val rdd = sc.makeRDD(
      List(("a", 1), ("b", 2), ("a", 1), ("b", 2), ("a", 1), ("b", 2)), 2
    )
    // expand each pair into repeated words, e.g. ("b", 2) -> "b b "
    val strRDD: RDD[String] = rdd.map {
      case (str, cnt) => (str + " ") * cnt
    }
    val flatMapRDD = strRDD.flatMap(_.split(" "))
    val result: collection.Map[String, Long] = flatMapRDD.countByValue()
    println(result)
  }
}
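
countByValue on an RDD[T] amounts to pairing each element with a count and then counting keys; a sketch of the equivalence, reusing flatMapRDD from above:

    val viaCountByKey: collection.Map[String, Long] = flatMapRDD.map((_, 1)).countByKey()
    println(viaCountByKey)  // same result as flatMapRDD.countByValue()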