// Fixed: the original used typographic quotes (“ ”), which do not compile in Scala.
// Three space-separated lines of words used as the word-count input.
val lines = Array("spark hadoop spark flink", "hadoop flink flink spark", "hbase flink")
lines: Array[String] = Array(spark hadoop spark flink, hadoop flink flink spark, hbase flink)
按空格切分数组：
// Split each line on spaces -> Array[Array[String]] (one inner array per line)
lines.map(x=>x.split(" "))
Array[Array[String]] = Array(Array(spark, hadoop, spark, flink), Array(hadoop, flink, flink, spark), Array(hbase, flink))
扁平化:
// Flatten the nested arrays into a single Array[String] of all words
lines.map(x=>x.split(" ")).flatten
res1: Array[String] = Array(spark, hadoop, spark, flink, hadoop, flink, flink, spark, hbase, flink)
map+flatten=flatMap
这里两步操作可以合并为一步：lines.flatMap(_.split(" "))
生成word,1的对偶元组,这里array里的元素类型就是tuple
// Pair every word with the count 1 -> Array[(String, Int)]
lines.flatMap(_.split(" ")).map(x=>(x,1))
res4: Array[(String, Int)] = Array((spark,1), (hadoop,1), (spark,1), (flink,1), (hadoop,1), (flink,1), (flink,1), (spark,1), (hbase,1), (flink,1))
将word相同的元组放在同一个组里面:
// Group the (word, 1) tuples by the word (first tuple element) -> Map[String, Array[(String, Int)]]
lines.flatMap(_.split(" ")).map(x=>(x,1)).groupBy(x=>x._1)
res5: scala.collection.immutable.Map[String,Array[(String, Int)]] = Map(hadoop -> Array((hadoop,1), (hadoop,1)), spark -> Array((spark,1), (spark,1), (spark,1)), flink -> Array((flink,1), (flink,1), (flink,1), (flink,1)), hbase -> Array((hbase,1)))
通过求数组的length来求和:
// Count each word by taking the length of its group -> Map[String, Int]
lines.flatMap(_.split(" ")).map(x=>(x,1)).groupBy(x=>x._1).map(x=>(x._1,x._2.length))
res6: scala.collection.immutable.Map[String,Int] = Map(hadoop -> 2, spark -> 3, flink -> 4, hbase -> 1)
转化成List
// Convert the Map to a List of (word, count) pairs so it can be sorted
lines.flatMap(_.split(" ")).map(x=>(x,1)).groupBy(x=>x._1).map(x=>(x._1,x._2.length)).toList
res7: List[(String, Int)] = List((hadoop,2), (spark,3), (flink,4), (hbase,1))
默认升序排序
// sortBy on the count (x._2) sorts in ascending order by default
lines.flatMap(_.split(" ")).map(x=>(x,1)).groupBy(x=>x._1).map(x=>(x._1,x._2.length)).toList.sortBy(x=>x._2)
res8: List[(String, Int)] = List((hbase,1), (hadoop,2), (spark,3), (flink,4))
reverse将升序转化成降序
// reverse turns the ascending sort into descending order by count
lines.flatMap(_.split(" ")).map(x=>(x,1)).groupBy(x=>x._1).map(x=>(x._1,x._2.length)).toList.sortBy(x=>x._2).reverse
res9: List[(String, Int)] = List((flink,4), (spark,3), (hadoop,2), (hbase,1))
另外一种写法:
// Alternative approach: flatMap straight to the flat word array
lines.flatMap(_.split(" "))
res0: Array[String] = Array(spark, hadoop, spark, flink, hadoop, flink, flink, spark, hbase, flink)
// Group identical words together -> Map[word, Array[word]] (no (word,1) pairs needed)
lines.flatMap(_.split(" ")).groupBy(x=>x)
scala.collection.immutable.Map[String,Array[String]] = Map(hadoop -> Array(hadoop, hadoop), spark -> Array(spark, spark, spark), flink -> Array(flink, flink, flink, flink), hbase -> Array(hbase))
// Fixed: the placeholder underscores were missing — ".split" and ".length" had no "_",
// which is a syntax error. Count each word by mapping every group to its size.
// NOTE(review): mapValues returns a lazy MapView on Scala 2.13+; use
// .view.mapValues(_.length).toMap there if a strict Map is required.
lines.flatMap(_.split(" ")).groupBy(x=>x).mapValues(_.length)
res2: scala.collection.immutable.Map[String,Int] = Map(hadoop -> 2, spark -> 3, flink -> 4, hbase -> 1)
后面类似,自行补充