The data file (data/word.txt) contains:

spark spark hello spark ... scala spark spark

//TODO Approach 1: aggregate

package com.bigdata.spark.core.wordcount12

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
 * @author shkstart
 * @create 2020-09-06 22:10
 */
object aggregate {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("aggregate")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // Zero value: an empty Map[String, Int].
    // seqOp folds each (word, 1) pair of a partition into a local Map;
    // combOp merges the per-partition Maps key by key.
    val stringToInt: Map[String, Int] = mapRDD.aggregate(Map[String, Int]())(
      (map, kv) => {
        val v: Int = map.getOrElse(kv._1, 0) + kv._2
        map.updated(kv._1, v)
      },
      (map1, map2) => {
        map1.foldLeft(map2) {
          case (map, (k, v)) => map + (k -> (map.getOrElse(k, 0) + v))
        }
      }
    )

    println(stringToInt)

    sc.stop()
  }
}
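Note that aggregate is an action rather than a transformation: each partition folds its records into a local Map, and those per-partition Maps are then merged into a single Scala Map on the driver. That is fine for a small vocabulary like this one, but it does not scale to high-cardinality keys the way the *ByKey operators in the following approaches do.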
//TODO Approach 2: aggregateByKey

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:43
 */
object aggregateByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("aggregateByKey")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val map1RDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // zero value 0; the first _+_ aggregates within a partition,
    // the second _+_ merges the partial sums across partitions
    val aggRDD: RDD[(String, Int)] = map1RDD.aggregateByKey(0)(_ + _, _ + _)
    aggRDD.collect.foreach(println)

    sc.stop()
  }
}
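With a zero value of 0 and addition for both functions, aggregateByKey(0)(_ + _, _ + _) produces the same result as foldByKey(0)(_ + _) (approach 8) and reduceByKey(_ + _) (approach 12). The extra generality only pays off when the within-partition and cross-partition functions differ, e.g. taking a max inside each partition and then summing the maxima.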
//TODO Approach 3: cogroup

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:57
 */
object cogroup {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("cogroup")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // cogroup the RDD with itself: each word maps to a pair of identical
    // Iterable[Int]s, and the size of either iterable is the word count
    val cogroupRDD: RDD[(String, (Iterable[Int], Iterable[Int]))] = mapRDD.cogroup(mapRDD)
    val map1RDD: RDD[(String, Int)] = cogroupRDD.map(kv => {
      val num: Int = kv._2._1.size
      (kv._1, num)
    })
    map1RDD.collect.foreach(println)

    sc.stop()
  }
}
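Cogrouping an RDD with itself is the most contrived variant here: each word ends up paired with two identical Iterable[Int]s, and the count is simply the size of either one. It works, but it shuffles both (identical) inputs in full.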
//TODO Approach 4: combineByKey

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:48
 */
object combineByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("combineByKey")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // createCombiner: the first 1 seen for a key becomes the accumulator as-is;
    // mergeValue adds within a partition, mergeCombiners adds across partitions
    val comRDD: RDD[(String, Int)] = mapRDD.combineByKey(
      num => num,
      (x: Int, y: Int) => x + y,
      (x: Int, y: Int) => x + y
    )
    comRDD.collect.foreach(println)

    sc.stop()
  }
}
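combineByKey is the most general of the by-key aggregators; in Spark's PairRDDFunctions, reduceByKey, foldByKey, and aggregateByKey all appear to be thin wrappers over the same combineByKeyWithClassTag machinery, differing only in how the first value seen for a key becomes the initial accumulator.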
//TODO Approach 5: countByKey

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:52
 */
object countByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("countByKey")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // countByKey is an action: the result is a Map[String, Long] on the driver
    mapRDD.countByKey().foreach(println)

    sc.stop()
  }
}
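Because countByKey materializes its result on the driver rather than returning an RDD, it is only appropriate when the number of distinct keys is small enough to fit in driver memory.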
//TODO Approach 6: countByValue

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:55
 */
object countByValue {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("countByValue")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))

    // counts the occurrences of each distinct element, no (word, 1) mapping needed
    flatRDD.countByValue().foreach(println)

    sc.stop()
  }
}
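countByValue removes even the (word, 1) mapping step: it counts how many times each distinct element occurs in the RDD. Internally it is essentially map(value => (value, null)).countByKey(), so the same driver-memory caveat as countByKey applies.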
//TODO Approach 7: fold

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 23:34
 */
object fold {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("fold")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))

    // map every word to a singleton Map so that the zero value and the
    // elements share the type Map[String, Int], as fold requires
    val mapRDD: RDD[Map[String, Int]] = flatRDD.map(str => Map[String, Int](str -> 1))

    val stringToInt: Map[String, Int] = mapRDD.fold(Map[String, Int]())((map1, map2) => {
      // equivalent one-liner using the /: operator:
      // (map1 /: map2) { case (map, (k, v)) => map + (k -> (v + map.getOrElse(k, 0))) }
      map1.foldLeft(map2) {
        case (map, (k, v)) => map.updated(k, map.getOrElse(k, 0) + v)
      }
    })

    println(stringToInt)

    sc.stop()
  }
}
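The map-merging function used by fold (and by aggregate and reduce in approaches 1 and 11) is easiest to understand in isolation on plain Scala collections. A minimal standalone sketch, runnable in the Scala REPL; the two sample maps are made up for illustration:

val m1 = Map("spark" -> 2, "hello" -> 1)
val m2 = Map("spark" -> 3, "scala" -> 1)

// fold m1's entries into m2, adding the counts of keys present in both
val merged = m1.foldLeft(m2) { case (acc, (k, v)) =>
  acc.updated(k, acc.getOrElse(k, 0) + v)
}
// merged == Map("spark" -> 5, "scala" -> 1, "hello" -> 1)

Because the merge is commutative and associative, it does not matter which map is the initial accumulator and which is folded over, which is why both the foldLeft form and the commented /: form give the same result.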
//TODO Approach 8: foldByKey

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:45
 */
object foldByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("foldByKey")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val map1RDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // zero value 0, then add with _+_ both within and across partitions
    val foldRDD: RDD[(String, Int)] = map1RDD.foldByKey(0)(_ + _)
    foldRDD.collect.foreach(println)

    sc.stop()
  }
}
//TODO Approach 9: groupBy

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:18
 */
object groupBy {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("groupBy")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))

    // group the words by themselves; each group's size is the count
    val groupRDD: RDD[(String, Iterable[String])] = flatRDD.groupBy(word => word)
    val mapRDD: RDD[(String, Int)] = groupRDD.map(kv => (kv._1, kv._2.size))
    mapRDD.collect.foreach(println)

    sc.stop()
  }
}
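groupBy(word => word) keeps the words themselves as the group values, so no (word, 1) mapping is needed and the count is just the group size. Like groupByKey below, though, it shuffles every element un-reduced.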
//TODO Approach 10: groupByKey

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:39
 */
object groupByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("groupByKey")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // collect all 1s for each word, then count them
    val groupRDD: RDD[(String, Iterable[Int])] = mapRDD.groupByKey()
    val map1RDD: RDD[(String, Int)] = groupRDD.map(kv => (kv._1, kv._2.size))
    map1RDD.collect.foreach(println)

    sc.stop()
  }
}
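groupByKey ships every single (word, 1) pair across the network before anything is summed. When the grouping is immediately followed by a reduction, reduceByKey (approach 12) is the better choice: it combines values map-side before the shuffle and therefore moves far less data.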
//TODO Approach 11: reduce

package com.bigdata.spark.core.wordcount12

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

/**
 * @author shkstart
 * @create 2020-09-07 0:01
 */
object reduce {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("reduce")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))

    // map every word to a singleton Map so that reduce can merge Maps pairwise
    val mapRDD: RDD[Map[String, Int]] = flatRDD.map(str => Map[String, Int](str -> 1))

    val stringToInt: Map[String, Int] = mapRDD.reduce((map1, map2) => {
      // merge map1 into map2, key by key
      map1.foldLeft(map2) {
        case (map, (k, v)) => map.updated(k, map.getOrElse(k, 0) + v)
      }
    })

    println(stringToInt)

    sc.stop()
  }
}
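reduce differs from fold (approach 7) only in having no zero value: the first singleton Map encountered in each partition serves as the initial accumulator. Like aggregate and fold, it is an action, so the final merged Map lives on the driver.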
//TODO Approach 12: reduceByKey

package com.bigdata.spark.core.wordcount12

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * @author shkstart
 * @create 2020-09-06 18:33
 */
object reduceByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("reduceByKey")
    val sc = new SparkContext(sparkconf)

    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // sum the 1s per word; values are combined map-side before the shuffle
    val reduceByKey: RDD[(String, Int)] = mapRDD.reduceByKey(_ + _)
    reduceByKey.collect.foreach(println)

    sc.stop()
  }
}
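This last version is the canonical Spark WordCount: flatMap to words, map to (word, 1), then reduceByKey(_ + _). Whichever approach is used, the output is one count per distinct word in the sample file; the exact numbers depend on the elided portion of the data shown at the top.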