11 Ways to Implement WordCount in Spark

All eleven examples below run on the same pair RDD of (word, count) entries and produce the per-word totals (a, 9) and (b, 12).

Method 1: groupBy

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount_01_groupBy {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
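    // groupBy keeps the whole (word, count) tuple in each group, so extract the counts before summing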
    rdd.groupBy(_._1).mapValues(_.map(_._2).sum).collect().foreach(println)
    sc.stop()
  }
}

Method 2: groupByKey

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount_02_groupByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
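    // groupByKey groups only the values for each key, so the Iterable[Int] can be summed directly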
    rdd.groupByKey().mapValues(_.sum).collect().foreach(println)
    sc.stop()
  }
}

Method 3: reduceByKey

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount_03_reduceByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
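    // reduceByKey merges the values for each key with a map-side combine; the most common WordCount form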
    rdd.reduceByKey(_+_).collect().foreach(println)
    sc.stop()
  }
}

Method 4: aggregateByKey

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount_04_aggregateByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
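    // aggregateByKey takes a zero value plus separate intra-partition and inter-partition combine functions; both are addition here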
    rdd.aggregateByKey(0)(_+_,_+_).collect().foreach(println)
    sc.stop()
  }
}

Method 5: foldByKey

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount_05_foldByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
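    // foldByKey is aggregateByKey with a single function used for both the intra- and inter-partition combine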
    rdd.foldByKey(0)(_+_).collect().foreach(println)
    sc.stop()
  }
}

Method 6: combineByKey

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount_06_combineByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
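    // combineByKey takes three functions: createCombiner (first value per key),
    // mergeValue (within a partition) and mergeCombiners (across partitions)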
    rdd.combineByKey(
      v=>v,
      (t1:Int,v) => t1 + v,
      (t1:Int,t2:Int) => t1 + t2
    ).collect().foreach(println)
    sc.stop()
  }
}

Method 7: countByKey

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount_07_countByKey {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
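    // Repeat each word count times (e.g. ("a", 2) becomes "a a "), re-pair each word with 1,
    // then countByKey tallies the occurrences per key on the driver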
    rdd.map(data => {
      (data._1 + " ") * data._2
    }).flatMap(_.split(" ")).map((_,1)).countByKey().foreach(println)
    sc.stop()
  }
}

Method 8: countByValue

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object WordCount_08_countByValue {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
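    // Repeat each word count times as in method 7, then countByValue tallies how often each word
    // occurs without needing the intermediate (word, 1) pairs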
    rdd.map(data => {
      (data._1 + " ") * data._2
    }).flatMap(_.split(" ")).countByValue().foreach(println)
    sc.stop()
  }
}

Method 9: aggregate

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object WordCount_09_aggregate {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
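    // Wrap each pair in a single-entry mutable Map, then aggregate merges the maps
    // within each partition and across partitions using foldLeft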
    rdd.map(data => mutable.Map(data._1 -> data._2)).aggregate(mutable.Map[String,Int]())(
      (map1:mutable.Map[String,Int], map2:mutable.Map[String,Int]) => {
        map1.foldLeft(map2)(
          (innermap,kv) => {
            innermap(kv._1) = innermap.getOrElse(kv._1,0) + kv._2
            innermap
          }
        )
      },
      (map1:mutable.Map[String,Int],map2:mutable.Map[String,Int]) =>{
        map1.foldLeft(map2)(
          (innermap,kv)=>{
            innermap(kv._1) = innermap.getOrElse(kv._1,0) + kv._2
            innermap
          }
        )
      }
    ).foreach(println)
    sc.stop()
  }
}

Method 10: fold

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable

object WordCount_10_fold {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
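    // fold requires the elements and the zero value to share one type, so each pair is wrapped in a mutable Map before merging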
    rdd.map(data => mutable.Map(data._1 -> data._2)).fold(mutable.Map[String,Int]())(
      (map1:mutable.Map[String,Int],map2:mutable.Map[String,Int])=>{
        map1.foldLeft(map2)(
          (resmap,kv) => {
            resmap(kv._1) = resmap.getOrElse(kv._1,0) + kv._2
            resmap
          }
        )
      }
    ).foreach(println)
    sc.stop()
  }
}

Method 11: reduce + foldLeft

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable

object WordCount_11_reduce_foldleft {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount")
    val sc = new SparkContext(conf)
    // Prepare the sample data
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(
        ("a", 1), ("a", 2), ("b", 3),
        ("b", 4), ("b", 5), ("a", 6)
      ), 2
    )
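    // reduce merges the single-entry maps pairwise; foldLeft accumulates one map's entries into the other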
    rdd.map(data => mutable.Map(data._1 -> data._2)).reduce((map1,map2)=>{
      map1.foldLeft(map2)(
        (resmap,kv) => {
          resmap(kv._1) = resmap.getOrElse(kv._1, 0) + kv._2
          resmap
        }
      )
    }).foreach(println)
    sc.stop()
  }
}
