// Basic environment: local-mode SparkContext for a word-count demo.
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
val sc: SparkContext = new SparkContext(sparkConf)
val fileRDD: RDD[String] = sc.textFile("D:\\workplace\\ifeng-Spark\\ifeng-Spark-core\\data\\wc.txt")
// FIX: flatMap, not map. `map(_.split(","))` produces RDD[Array[String]], and JVM
// arrays compare by reference, so grouping/keying by Array[String] never merges
// equal words. Flattening to one word per record gives meaningful String keys.
val rdd: RDD[String] = fileRDD.flatMap(_.split(","))
val rdd1 = sc.parallelize(List("a", "b", "c", "d", "e", "a"))

// 1. groupBy — group identical words, count each group's size.
val value1: RDD[(String, Iterable[String])] = rdd.groupBy(x => x)
val res1: RDD[(String, Int)] = value1.map(x => (x._1, x._2.size))

// 2. groupByKey — shuffle all 1s per key, then sum them locally.
val mapRDD2: RDD[(String, Int)] = rdd.map((_, 1))
val gKRDD2: RDD[(String, Iterable[Int])] = mapRDD2.groupByKey()
val res2: RDD[(String, Int)] = gKRDD2.map(x => (x._1, x._2.sum))

// 3. reduceByKey — map-side combine; preferred over groupByKey for sums.
val mapRDD3: RDD[(String, Int)] = rdd.map((_, 1))
val res3: RDD[(String, Int)] = mapRDD3.reduceByKey(_ + _)

// 4. aggregateByKey — zero value plus separate seqOp / combOp (both sums here).
val mapRDD4: RDD[(String, Int)] = rdd.map((_, 1))
val res4: RDD[(String, Int)] = mapRDD4.aggregateByKey(0)(_ + _, _ + _)

// 5. foldByKey — aggregateByKey special case where seqOp == combOp.
val mapRDD5: RDD[(String, Int)] = rdd.map((_, 1))
val res5: RDD[(String, Int)] = mapRDD5.foldByKey(0)(_ + _)

// 6. combineByKey — createCombiner / mergeValue / mergeCombiners spelled out.
val mapRDD6: RDD[(String, Int)] = rdd.map((_, 1))
val res6: RDD[(String, Int)] = mapRDD6.combineByKey(
  (x: Int) => x,
  (x: Int, y: Int) => x + y,
  (m: Int, n: Int) => m + n
)
res6.collect().foreach(println)

// 7. countByKey — action: returns a driver-local Map of key -> occurrence count.
val mapRDD7: RDD[(String, Int)] = rdd.map((_, 1))
val res7: collection.Map[String, Long] = mapRDD7.countByKey()

// 8. countByValue — action: counts each distinct element without pre-pairing.
val res8: collection.Map[String, Long] = rdd.countByValue()

// 9. reduce — wrap each word in a singleton Map, then merge Maps pairwise.
val newRDD: RDD[Map[String, Int]] = rdd1.map(word => Map[String, Int]((word, 1)))
val reduceWC: Map[String, Int] = newRDD.reduce((map1, map2) =>
  map1.foldLeft(map2) { case (acc, (word, count)) =>
    acc.updated(word, acc.getOrElse(word, 0) + count)
  }
)

// 10. fold — same merge as reduce, but with an explicit empty-Map zero value.
val foldWC: Map[String, Int] = newRDD.fold(Map[String, Int]())((map1, map2) =>
  map1.foldLeft(map2) { case (acc, (word, count)) =>
    acc.updated(word, acc.getOrElse(word, 0) + count)
  }
)

// 11. aggregate — zero value; seqOp folds one raw element into a partition-local
// Map; combOp merges partition Maps.
// FIX: the original called aggregate on newRDD (RDD[Map[String, Int]]) while the
// seqOp treated each element as a single key — a type error. The seqOp expects a
// raw word, so aggregate must run on rdd1 (RDD[String]).
val aggregateWC: Map[String, Int] = rdd1.aggregate(Map[String, Int]())(
  (map, word) => map.updated(word, map.getOrElse(word, 0) + 1),
  (map1, map2) =>
    map1.foldLeft(map2) { case (acc, (word, count)) =>
      acc.updated(word, acc.getOrElse(word, 0) + count)
    }
) // FIX: original closed this argument list with '}' instead of ')'
} // closes the enclosing scope opened before this excerpt