spark-WordCount 源码分析图解
1. maven依赖
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.4</version>
<exclusions>
<exclusion>
<groupId>io.netty</groupId>
<artifactId>netty</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-all</artifactId>
<version>4.1.18.Final</version>
</dependency>
</dependencies>
2. scala代码
package spark
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
  /**
   * Classic Spark word-count demo.
   *
   * Reads the local file "data/word", counts occurrences of each
   * space-separated word, then counts how many distinct words share each
   * frequency, printing both results on the driver.
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("WordCount")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    // One RDD element per input line; split each line into words.
    val fileRDD = sc.textFile("data/word")
    val words = fileRDD.flatMap(_.split(" "))

    // Pair each word with 1, then sum per key -> (word, count).
    val pairWord = words.map(word => (word, 1))
    val res = pairWord.reduceByKey(_ + _)

    // Maps (word, count) -> (count, 1) and sums again, i.e. counts how many
    // distinct words occur with each frequency.
    // NOTE(review): the original name "fanzhuan" (= "reverse") suggests
    // (count, word) may have been intended; behavior kept as written.
    val freqOfFreq = res.map(pair => (pair._2, 1))
    val value = freqOfFreq.reduceByKey(_ + _)

    // foreach(println) runs on executors; with master "local" the output
    // appears in the driver console.
    value.foreach(println)
    res.foreach(println)

    // Keep the driver alive so the Spark web UI (localhost:4040) stays
    // available for inspection; demo-only, not production practice.
    Thread.sleep(Long.MaxValue)
  }
}
图解: