wordCount
groupBy
/** Word count using `groupBy`: every word is grouped with itself and each group is sized.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * The counts are collected into a local `Map` and printed.
  *
  * @param rdd text to count; each element is split on whitespace
  */
def wordCount1(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordGroupByItself: RDD[(String, Iterable[String])] = words.groupBy(word => word)
  // Only the size of each group is needed; the grouped words themselves are discarded.
  val wordCount: Map[String, Int] = wordGroupByItself.mapValues(_.size).collect().toMap
  println(wordCount)
}
groupByKey
/** Word count using `groupByKey`: pairs every word with 1, groups the 1s per word and
  * counts how many there are.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * The counts are collected into a local `Map` and printed, like the other word-count
  * variants; previously the computed map was discarded.
  *
  * @param rdd text to count; each element is split on whitespace
  */
def wordCount3(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordByOne: RDD[(String, Int)] = words.map(word => word -> 1)
  // Size the grouped values directly; copying them into a List first adds nothing.
  val wordCount: Map[String, Int] = wordByOne.groupByKey().mapValues(_.size).collect().toMap
  println(wordCount)
}
reduceByKey
/** Word count using `reduceByKey`: pairs every word with 1 and sums the 1s per word.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * The counts are collected into a local `Map` and printed.
  *
  * @param rdd text to count; each element is split on whitespace
  */
def wordCount4(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordOne: RDD[(String, Int)] = words.map((_, 1))
  val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
  val wordCountResult: Map[String, Int] = wordCount.collect().toMap
  println(wordCountResult)
}
aggregateByKey
/** Word count using `aggregateByKey`: pairs every word with 1 and aggregates per word,
  * starting from 0 and using addition for both supplied functions.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * The counts are collected into a local `Map` and printed.
  *
  * @param rdd text to count; each element is split on whitespace
  */
def wordCount5(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordOne: RDD[(String, Int)] = words.map((_, 1))
  // Zero value 0; the same addition is used to fold in a value and to merge two partial sums.
  val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
  val wordCountResult: Map[String, Int] = wordCount.collect().toMap
  println(wordCountResult)
}
foldByKey
/** Word count using `foldByKey`: pairs every word with 1 and folds the 1s per word with
  * addition, starting from 0.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * The counts are collected into a local `Map` and printed.
  *
  * @param rdd text to count; each element is split on whitespace
  */
def wordCount6(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordOne: RDD[(String, Int)] = words.map((_, 1))
  val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
  val wordCountResult: Map[String, Int] = wordCount.collect().toMap
  println(wordCountResult)
}
combineByKey
/** Word count using `combineByKey`: pairs every word with 1 and combines the 1s per word.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * The counts are collected into a local `Map` and printed.
  *
  * @param rdd text to count; each element is split on whitespace
  */
def wordCount7(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordOne: RDD[(String, Int)] = words.map((_, 1))
  val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
    // createCombiner: the argument is the Int value paired with the word (not the word
    // itself), and it is used unchanged as the starting count.
    (count: Int) => count,
    // mergeValue: add another value to a running count.
    (acc: Int, count: Int) => acc + count,
    // mergeCombiners: add two running counts together.
    (left: Int, right: Int) => left + right
  )
  val wordCountResult: Map[String, Int] = wordCount.collect().toMap
  println(wordCountResult)
}
countByKey
/** Word count using `countByKey`: pairs every word with 1 and counts the pairs per word.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * `countByKey` already returns a local map (with `Long` counts), which is printed.
  *
  * @param rdd text to count; each element is split on whitespace
  */
def wordCount8(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordOne: RDD[(String, Int)] = words.map((_, 1))
  val wordCount: scala.collection.Map[String, Long] = wordOne.countByKey()
  println(wordCount)
}
countByValue
/** Word count using `countByValue`: counts how often each word occurs, with no
  * intermediate (word, 1) pairs.
  *
  * Each element of `rdd` is split on runs of whitespace. Empty tokens are dropped because
  * `String.split` produces "" for a blank line and for a line that starts with whitespace,
  * and those would otherwise be counted as a word.
  *
  * `countByValue` already returns a local map (with `Long` counts), which is printed.
  *
  * @param rdd text to count; each element is split on whitespace
  */
private def wordCount2(rdd: RDD[String]): Unit = {
  val words: RDD[String] = rdd.flatMap(_.split("\\s+")).filter(_.nonEmpty)
  val wordCount: collection.Map[String, Long] = words.countByValue()
  println(wordCount)
}