spark-WordCount 源码分析图解
1. maven依赖
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.4</version>
<exclusions>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-all</artifactId>
<version>4.1.18.Final</version>
</dependency>
</dependencies>
2. scala代码
package spark
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
  /**
   * Classic Spark word-count demo.
   *
   * Reads the local file "data/word", counts occurrences of each
   * space-separated word, then counts how many distinct words share each
   * frequency, printing both results on the driver.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("WordCount")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    // One RDD element per input line; split each line into words.
    val fileRDD = sc.textFile("data/word")
    val words = fileRDD.flatMap(_.split(" "))

    // Pair each word with 1, then sum per key -> (word, count).
    val pairWord = words.map(word => (word, 1))
    val res = pairWord.reduceByKey(_ + _)

    // Maps (word, count) -> (count, 1) and sums again, i.e. counts how many
    // distinct words occur with each frequency.
    // NOTE(review): the original name "fanzhuan" (= "reverse") suggests
    // (count, word) may have been intended; behavior kept as written.
    val freqOfFreq = res.map(pair => (pair._2, 1))
    val value = freqOfFreq.reduceByKey(_ + _)

    // foreach(println) runs on executors; with master "local" the output
    // appears in the driver console.
    value.foreach(println)
    res.foreach(println)

    // Keep the driver alive so the Spark web UI (localhost:4040) stays
    // available for inspection; demo-only, not production practice.
    Thread.sleep(Long.MaxValue)
  }
}
图解: