1.一行代码
lines.flatMap(_.split(" ")).map((_, 1)).groupBy(_._1).map(tuple => (tuple._1, tuple._2.size)).toList.sortBy(_._2)
2.代码分解
package com.w4xj.scala.study0502
object WorldCount {
  /**
   * Tutorial: word count over a small in-memory corpus.
   * First the pipeline is built one named step at a time, then the same
   * pipeline is repeated as a single chained expression.
   * Both results are printed to stdout.
   */
  def main(args: Array[String]): Unit = {
    val lines = List("scala spark storm hadoop sqoop flume",
      "kafka scala spark oozie scala spark zookeeper",
      "hadoop hive hbase scala")

    // Step 1: flatten — split every line into individual words.
    val words = lines.flatMap(line => line.split(" "))

    // Step 2: pair each word with an initial count of 1.
    val pairs = words.map(word => (word, 1))

    // Step 3: group the pairs by the word itself.
    // (Tuple fields are 1-based in Scala: _1 is the word, _2 the count.)
    val grouped = pairs.groupBy { case (word, _) => word }

    // Step 4: reduce — the size of each word's group is its total count.
    val counted = grouped.map { case (word, occurrences) => (word, occurrences.size) }

    // Step 5: a Map is unordered, so convert to a List and sort by count (ascending).
    val result5 = counted.toList.sortBy { case (_, count) => count }

    // Expected: List((storm,1), (sqoop,1), (oozie,1), (kafka,1), (hive,1), (zookeeper,1), (flume,1), (hbase,1), (hadoop,2), (spark,3), (scala,4))
    println("result5 = " + result5)
    println("=========================================")

    // The identical pipeline written as one chained expression.
    val scalaWorldCount = lines
      .flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(_._1)
      .map(tuple => (tuple._1, tuple._2.size))
      .toList
      .sortBy(_._2)
    // Expected: same list as result5 above.
    println("scalaWorldCount = " + scalaWorldCount)
  }
}