package com.huc.WordCount
/** Step-by-step word-count demo over pre-aggregated input.
  *
  * Each input entry pairs a sentence with the number of times that sentence
  * occurred. The demo prints every intermediate result and finishes with the
  * three words that have the highest total count.
  */
object WordCount3 {

  /** Runs the demo, printing each intermediate result to standard output.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    // Raw data: (sentence, number of occurrences of that sentence).
    val tupleList = List(("Hello Scala Spark World ", 4), ("Hello Scala Spark ", 3), ("Hello Scala ", 2), ("Hello ", 1))

    // Step 1: flatten into (word, count) pairs, one pair per word of a sentence.
    // Empty tokens are filtered out so that leading or repeated spaces can
    // never be counted as a word.
    val tuples = tupleList.flatMap { case (sentence, count) =>
      sentence.split(" ").filter(_.nonEmpty).map(word => (word, count))
    }
    // The pairs are computed once but printed twice, which keeps the program's
    // output the same as before the duplicate computation was removed.
    println(tuples)
    println(tuples)

    // Step 2: gather all pairs that share the same word.
    val map = tuples.groupBy(_._1)
    println(map)

    // Step 3: reduce every group to (word, total count).
    // A strict `map` is used instead of `mapValues`: `mapValues` produces a
    // lazy view (a `MapView` as of Scala 2.13), which would print as
    // "MapView(<not computed>)" rather than showing the totals.
    val map1 = map.map { case (word, pairs) =>
      // e.g. List((Hello,4), (Hello,3), (Hello,2), (Hello,1)) => 10
      word -> pairs.foldLeft(0)((total, pair) => total + pair._2)
    }
    println(map1)

    // Step 4: sort by total count, highest first, and keep the top three.
    val tuples1 = map1.toList.sortBy(_._2)(Ordering[Int].reverse)
    println(tuples1)
    println(tuples1.take(3))
  }
}
package com.huc.WordCount
/** Compact word-count demo over pre-aggregated input.
  *
  * Each input entry pairs a sentence with the number of times that sentence
  * occurred. The top three words by total count are computed twice: once with
  * named intermediate values and once as a single chained expression.
  */
object WordCount6 {

  /** Runs the demo, printing the intermediate and final results to standard
    * output.
    *
    * @param args command-line arguments; not used
    */
  def main(args: Array[String]): Unit = {
    // Raw data: (sentence, number of occurrences of that sentence).
    val tupleList = List(("Hello Scala Spark World", 4), ("Hello Scala Spark", 3), ("Hello Scala", 2), ("Hello ", 1))

    // Flatten into (word, count) pairs. Empty tokens are filtered out so that
    // leading or repeated spaces can never be counted as a word.
    val tuples: List[(String, Int)] = tupleList.flatMap { case (sentence, count) =>
      sentence.split(" ").filter(_.nonEmpty).map(word => (word, count))
    }

    // Gather all pairs that share the same word.
    val map: Map[String, List[(String, Int)]] = tuples.groupBy(_._1)
    println(map)

    // Reduce every group to (word, total count).
    // A strict `map` is used instead of `mapValues`: `mapValues` produces a
    // lazy view (a `MapView` as of Scala 2.13), which does not conform to the
    // declared `Map[String, Int]` type.
    val map1: Map[String, Int] = map.map { case (word, pairs) =>
      word -> pairs.map(_._2).sum
    }
    println(map1)
    println(map1.toList.sortWith(_._2 > _._2).take(3).mkString(" ,"))

    // The same pipeline written as a single expression.
    println(
      tupleList
        .flatMap { case (sentence, count) =>
          sentence.split(" ").filter(_.nonEmpty).map(word => (word, count))
        }
        .groupBy(_._1)
        .map { case (word, pairs) => word -> pairs.map(_._2).sum }
        .toList
        .sortWith(_._2 > _._2)
        .take(3)
        .mkString(" ,")
    )
  }
}