Six ways to write WordCount (a build dependency sketch follows this list):
1. Native Scala, using groupBy
2. Spark, using reduceByKey
3. Spark, using groupByKey
4. Spark, using groupBy
5. Flink DataStream, using keyBy
6. Flink DataSet, using groupBy
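The listing mixes Spark and Flink in a single project, so the build needs both sets of dependencies. A minimal build.sbt sketch, assuming Scala 2.11 with Spark 2.4.x and Flink 1.9.x (the version numbers are my assumption, not from the original post; align them with your cluster):

// build.sbt -- versions are assumptions, adjust as needed
name := "wordcount-demo"

scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"            % "2.4.8",
  "org.apache.flink" %% "flink-scala"           % "1.9.3",
  "org.apache.flink" %% "flink-streaming-scala" % "1.9.3"
)

With that in place, sbt run starts whichever variant main points at.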
package com.test

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.flink.api.scala.ExecutionEnvironment
// keep only ONE Flink wildcard import: bringing in both
// org.apache.flink.api.scala._ and org.apache.flink.streaming.api.scala._
// would make the implicit TypeInformation ambiguous
import org.apache.flink.streaming.api.scala._
object WordCount {

  def main(args: Array[String]): Unit = {
    val stringList = List("hello world", "hello scala", "hello you", "good world")
    // only one variant can run at a time; to try another, change the method name here
    wordCount5(stringList)
  }
  // 1. native Scala
  def wordCount1(stringList: List[String]): Unit = {
    stringList.flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(_._1)       // Map[word, List[(word, 1)]]
      .mapValues(_.size)   // Map[word, count]
      .toList
      .sortBy(_._2)
      .foreach(println)
  }
  // 2. Spark + reduceByKey
  def wordCount2(stringList: List[String]): Unit = {
    // local Spark configuration
    val conf = new SparkConf().setAppName("wordCount").setMaster("local")
    // create the Spark context
    val sc = new SparkContext(conf)
    // build an RDD from the in-memory list (not from a file)
    val input = sc.parallelize(stringList)
    // count, sort descending by count, and print
    input.flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
      .foreach(println)
    sc.stop()
  }
  // 3. Spark + groupByKey
  def wordCount3(stringList: List[String]): Unit = {
    val conf = new SparkConf().setAppName("wordCount").setMaster("local")
    val sc = new SparkContext(conf)
    val input = sc.parallelize(stringList)
    input.flatMap(_.split(" "))
      .map((_, 1))
      .groupByKey()   // (word, Iterable[1, 1, ...])
      .map(x => (x._1, x._2.sum))
      .sortBy(_._2)
      .foreach(println)
    sc.stop()
  }
  // 4. Spark + groupBy
  def wordCount4(stringList: List[String]): Unit = {
    val conf = new SparkConf().setAppName("wordCount").setMaster("local")
    val sc = new SparkContext(conf)
    val input = sc.parallelize(stringList)
    input.flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(_._1)       // (word, Iterable[(word, 1)])
      .mapValues(_.size)
      .foreach(println)
    sc.stop()
  }
  // 5. Flink DataStream + keyBy
  def wordCount5(stringList: List[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val source = env.fromCollection(stringList)
    source.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(_._1)
      .sum(1)
      .print()   // streaming: each incoming word emits an updated partial count
    env.execute("wordCount")
  }
  // 6. Flink DataSet + groupBy
  def wordCount6(stringList: List[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    val source = env.fromCollection[String](stringList)
    val result = source.flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(0)   // group by tuple field 0 (the word)
      .sum(1)
    // in the DataSet API, print() triggers execution itself; calling
    // env.execute() afterwards fails with "No new data sinks have been
    // defined since the last execution", so no execute() call here
    result.print()
  }
}
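For reference, with wordCount5 selected in main and parallelism fixed at 1, the streaming job prints one updated partial count per incoming word, so on this input the console output should look roughly like the following (inferred from the code, not captured from an actual run):

(hello,1)
(world,1)
(hello,2)
(scala,1)
(hello,3)
(you,1)
(good,1)
(world,2)

The batch variants instead print a single final count per word, e.g. (hello,3) and (world,2).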