object Spark_rdd_01 {

  /**
   * Runs eight word-count variants back to back on a local Spark context,
   * printing a separator line between consecutive demos.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RDD").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val demos: Seq[SparkContext => Unit] = Seq(
      wordcount1, wordcount2, wordcount3, wordcount4,
      wordcount5, wordcount6, wordcount7, wordcount8
    )
    demos.head(sc)
    demos.tail.foreach { demo =>
      println("-------")
      demo(sc)
    }
    sc.stop()
  }

  /** Word count via groupBy: group identical words, then take each group's size. */
  def wordcount1(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val counts: RDD[(String, Int)] = words.groupBy(identity).mapValues(_.size)
    counts.collect().foreach(println)
  }

  /** Word count via groupByKey: pair each word with 1, group by key, count values. */
  def wordcount2(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val pairs: RDD[(String, Int)] = words.map(word => (word, 1))
    val counts: RDD[(String, Int)] = pairs.groupByKey().mapValues(_.size)
    counts.collect().foreach(println)
  }

  /** Word count via reduceByKey: sum the 1s per key. */
  def wordcount3(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val counts: RDD[(String, Int)] = words.map(word => (word, 1)).reduceByKey(_ + _)
    counts.collect().foreach(println)
  }

  /** Word count via foldByKey: like reduceByKey but with an explicit zero value. */
  def wordcount4(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val counts: RDD[(String, Int)] = words.map(word => (word, 1)).foldByKey(0)(_ + _)
    counts.collect().foreach(println)
  }

  /** Word count via aggregateByKey: separate within- and cross-partition combiners. */
  def wordcount5(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val counts: RDD[(String, Int)] =
      words.map(word => (word, 1)).aggregateByKey(0)(_ + _, _ + _)
    counts.collect().foreach(println)
  }

  /** Word count via combineByKey: the most general per-key aggregation operator. */
  def wordcount6(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val pairs: RDD[(String, Int)] = words.map(word => (word, 1))
    val counts: RDD[(String, Int)] = pairs.combineByKey(
      (first: Int) => first,          // createCombiner: seed from the first value
      (acc: Int, v: Int) => acc + v,  // mergeValue: fold a value into a combiner
      (a: Int, b: Int) => a + b       // mergeCombiners: merge across partitions
    )
    counts.collect().foreach(println)
  }

  /** Word count via countByKey: an action returning a local Map of key counts. */
  def wordcount7(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val counts: collection.Map[String, Long] = words.map(word => (word, 1)).countByKey()
    println(counts)
  }

  /** Word count via countByValue: counts distinct elements directly, no pairing needed. */
  def wordcount8(sc: SparkContext): Unit = {
    val words: RDD[String] =
      sc.makeRDD(List("hello spark", "hello scala")).flatMap(_.split(" "))
    val counts: collection.Map[String, Long] = words.countByValue()
    println(counts)
  }
}
(spark,1)
(scala,1)
(hello,2)
-------
(spark,1)
(scala,1)
(hello,2)
-------
(spark,1)
(scala,1)
(hello,2)
-------
(spark,1)
(scala,1)
(hello,2)
-------
(spark,1)
(scala,1)
(hello,2)
-------
(spark,1)
(scala,1)
(hello,2)
-------
Map(spark -> 1, scala -> 1, hello -> 2)
-------
Map(spark -> 1, scala -> 1, hello -> 2)
Word count implemented with the `reduce` action (merging per-word maps):
object Spark_rdd_01 {

  /**
   * Word count implemented with the RDD.reduce action: each word is wrapped
   * in a single-entry mutable Map, and reduce merges the maps pairwise into
   * one final word-to-count Map on the driver.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RDD").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val rdd: RDD[String] = sc.makeRDD(List("hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    // Wrap each word as a one-entry Map so that reduce can merge maps.
    val mapWord: RDD[mutable.Map[String, Int]] = words.map(x => mutable.Map((x, 1)))
    // FIX: the original mutated map1 in place. A reduce function must not
    // modify its inputs — Spark may reuse RDD elements (e.g. when the RDD is
    // cached), so mutating an argument can silently corrupt data. Merge into
    // a fresh map instead; the result is identical.
    val wordcount: mutable.Map[String, Int] = mapWord.reduce { (map1, map2) =>
      val merged = mutable.Map.empty[String, Int]
      merged ++= map1
      map2.foreach { case (word, count) =>
        merged.update(word, merged.getOrElse(word, 0) + count)
      }
      merged
    }
    println(wordcount)
    sc.stop()
  }
}
Map(spark -> 1, scala -> 1, hello -> 2)