Spark 实现WordCount的11种方式

基础环境

 // Spark configuration: run locally on all available cores under the app name "WordCount".
    val sparkConf: SparkConf =
      new SparkConf()
        .setMaster("local[*]")
        .setAppName("WordCount")
    // The SparkContext is the connection to the Spark runtime.
    val sc: SparkContext = new SparkContext(sparkConf)
    // Read the input file; each RDD element is one line of text.
    val fileRDD: RDD[String] =
      sc.textFile("D:\\workplace\\ifeng-Spark\\ifeng-Spark-core\\data\\wc.txt")
    // One Array[String] per line (the comma-separated tokens of that line).
    // NOTE: the elements are arrays, not single words.
    val rdd: RDD[Array[String]] = fileRDD.map(line => line.split(","))

    // A small in-memory RDD of single words, used by the reduce/fold/aggregate examples.
    val rdd1: RDD[String] = sc.parallelize(List("a", "b", "c", "d", "e", "a"))

1 groupBy

    // Method 1: groupBy
      // `rdd` holds one Array[String] per line. Arrays compare by reference and Spark's
      // HashPartitioner refuses array keys, so flatten to individual words before grouping.
      val value1: RDD[(String, Iterable[String])] = rdd.flatMap(_.toSeq).groupBy(word => word)
      // The count of a word is the size of its group.
      val res1: RDD[(String, Int)] = value1.map { case (word, occurrences) =>
        (word, occurrences.size)
      }

2 groupByKey

    // Method 2: groupByKey
      // Flatten the per-line arrays into words first: array keys are compared by
      // reference and are rejected by Spark's hash-partitioned shuffle.
      val mapRDD2: RDD[(String, Int)] = rdd.flatMap(_.toSeq).map((_, 1))
      // Collect all the 1s emitted for each word ...
      val gKRDD2: RDD[(String, Iterable[Int])] = mapRDD2.groupByKey()
      // ... and add them up to get the word's count.
      val res2: RDD[(String, Int)] = gKRDD2.map { case (word, ones) =>
        (word, ones.sum)
      }

3 reduceByKey

    // Method 3: reduceByKey
      // Key by individual word (not by the per-line Array[String]): Spark cannot do
      // map-side combining with array keys.
      val mapRDD3: RDD[(String, Int)] = rdd.flatMap(_.toSeq).map((_, 1))
      // Sum the 1s per word; the same function is used within and across partitions.
      val res3: RDD[(String, Int)] = mapRDD3.reduceByKey(_ + _)

4 aggregateByKey

    // Method 4: aggregateByKey
      // Key by individual word; the per-line Array[String] is not a usable shuffle key.
      val mapRDD4: RDD[(String, Int)] = rdd.flatMap(_.toSeq).map((_, 1))
      // Zero value 0; first function adds within a partition, second merges partitions.
      val res4: RDD[(String, Int)] = mapRDD4.aggregateByKey(0)(_ + _, _ + _)

5 foldByKey

    // Method 5: foldByKey
      // Key by individual word; the per-line Array[String] is not a usable shuffle key.
      val mapRDD5: RDD[(String, Int)] = rdd.flatMap(_.toSeq).map((_, 1))
      // Zero value 0; the same addition is applied within and across partitions.
      val res5: RDD[(String, Int)] = mapRDD5.foldByKey(0)(_ + _)

6 combineByKey

    // Method 6: combineByKey
      // Key by individual word; the per-line Array[String] is not a usable shuffle key.
      val mapRDD6: RDD[(String, Int)] = rdd.flatMap(_.toSeq).map((_, 1))
      val res6: RDD[(String, Int)] = mapRDD6.combineByKey(
        (first: Int) => first, // createCombiner: the first value of a key becomes the accumulator
        (acc: Int, value: Int) => acc + value, // mergeValue: add values of the same key within a partition
        (left: Int, right: Int) => left + right // mergeCombiners: add per-partition sums of the same key
      )
    res6.collect().foreach(println)

7 countByKey

    // Method 7: countByKey
      // Key by individual word; the per-line Array[String] is not a usable shuffle key.
      val mapRDD7: RDD[(String, Int)] = rdd.flatMap(_.toSeq).map((_, 1))
      // countByKey is an action: it returns the number of pairs per key to the driver.
      val res7: collection.Map[String, Long] = mapRDD7.countByKey()

8 countByValue

    // Method 8: countByValue
      // Count occurrences of each word. The per-line arrays are flattened first, because
      // counting Array[String] elements would key on array identity rather than content.
      val res8: collection.Map[String, Long] = rdd.flatMap(_.toSeq).countByValue()

9 reduce

    // Method 9: reduce
      // Wrap every word in a single-entry map so that the maps can be merged pairwise.
      val newRDD: RDD[Map[String, Int]] = rdd1.map(w => Map[String, Int](w -> 1))
      // Merge two maps by folding the entries of the left one into the right one,
      // adding the counts of words that appear in both.
      val reduceWC: Map[String, Int] = newRDD.reduce { (left, right) =>
        left.foldLeft(right) { case (acc, (word, count)) =>
          acc.updated(word, acc.getOrElse(word, 0) + count)
        }
      }

10 fold

    // Method 10: fold
      // Same pairwise map merge as with reduce, but starting from an empty map as zero value.
      val foldWC: Map[String, Int] = newRDD.fold(Map[String, Int]()) { (left, right) =>
        left.foldLeft(right) { case (acc, (word, count)) =>
          acc.updated(word, acc.getOrElse(word, 0) + count)
        }
      }

11 aggregate

//    println(foldWC)
    // Method 11: aggregate
      // Aggregate over the word RDD (`rdd1`): the in-partition function receives one word
      // at a time, so the element type must be String rather than Map[String, Int].
      val aggregateWC: Map[String, Int] = rdd1.aggregate(Map[String, Int]())(
        // Within a partition: add 1 to the running count of the current word.
        (counts, word) => counts.updated(word, counts.getOrElse(word, 0) + 1),
        // Across partitions: fold the entries of one partial map into the other,
        // adding the counts of words present in both.
        (map1, map2) =>
          map1.foldLeft(map2) { case (acc, (word, count)) =>
            acc.updated(word, acc.getOrElse(word, 0) + count)
          }
      )





  }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

oifengo

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值