spark05-实现wordcount的11种方式

/**
 * Eleven ways to implement word count on Spark RDDs.
 *
 * Each `WordcountN` method computes the same result — the number of occurrences
 * of every word in the sample input `List("Hello Scala", "Hello Spark")` — using
 * a different RDD operator, and prints the (word, count) pairs to stdout.
 */
object wordcount {
  /** Entry point: builds a local SparkContext, runs one variant, and shuts down. */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("wordcount")
    val sc: SparkContext = new SparkContext(conf)

    Wordcount11(sc)
    sc.stop()
  }

  /** Variant 1: groupBy on the word itself, then count each group's size. */
  def Wordcount1(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val group: RDD[(String, Iterable[String])] = words.groupBy(word => word)
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
  }

  /** Variant 2: map to (word, 1) pairs, groupByKey, then count each group's size. */
  def Wordcount2(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val groupBy: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
    val wordCount: RDD[(String, Int)] = groupBy.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
  }

  /** Variant 3: reduceByKey — the idiomatic Spark word count. */
  def Wordcount3(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
    wordCount.collect().foreach(println)
  }

  /** Variant 4: aggregateByKey with zero value 0; same function (+) within and across partitions. */
  def Wordcount4(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
    wordCount.collect().foreach(println)
  }

  /** Variant 5: foldByKey — aggregateByKey specialized to one combine function. */
  def Wordcount5(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
    wordCount.collect().foreach(println)
  }

  /** Variant 6: combineByKey with explicit creator, in-partition merger, and cross-partition merger. */
  def Wordcount6(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
      v => v,                 // createCombiner: first value for a key is used as-is
      (x: Int, y) => x + y,   // mergeValue: fold subsequent values within a partition
      (t1, t2) => t1 + t2     // mergeCombiners: combine partial sums across partitions
    )
    wordCount.collect().foreach(println)
  }

  /** Variant 7: countByKey — an action that returns the counts to the driver as a Map. */
  def Wordcount7(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: collection.Map[String, Long] = wordOne.countByKey()
    wordCount.foreach(println)
  }

  /** Variant 8: countByValue — counts distinct words directly, no (word, 1) pairing needed. */
  def Wordcount8(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordCount: collection.Map[String, Long] = words.countByValue()
    wordCount.foreach(println)
  }

  /**
   * Variant 9: reduce over single-entry mutable Maps, merging entry counts pairwise.
   * Demonstrates implementing word count with a generic (non key/value) action.
   */
  def Wordcount9(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))

    // Wrap each word in its own one-entry map so that reduce can merge maps.
    val mapWord = words.map(word => {
      mutable.Map[String, Long]((word, 1))
    })

    val wordCount = mapWord.reduce(
      (map1, map2) => {
        // Debug trace of each merge step. Explicit tuple keeps the original
        // output while avoiding Scala 2's deprecated argument auto-tupling.
        println((map1, " ", map2))
        // The map's entries can't be read as plain k/v pairs directly, so
        // iterate and accumulate each count into map1.
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
          }
        }
        map1
      }
    )
    wordCount.foreach(println)
  }

  /**
   * Variant 10: aggregate with a mutable-Map accumulator.
   * The zero value uses the sentinel key "##" (filtered out before printing)
   * because aggregate requires a non-empty initial accumulator value.
   */
  def Wordcount10(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))

    val mapWord = words.map(word => {
      mutable.Map[String, Long]((word, 1))
    })

    val wordCount = mapWord.aggregate(mutable.Map[String, Long](("##", 0)))(
      (map1, map2) => {
        // In-partition merge: fold every entry of map2 into map1.
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
          }
        }
        map1
      },
      (map1, map2) => {
        // Cross-partition merge: same entry-by-entry accumulation.
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
          }
        }
        map1
      }
    )
    // Drop the sentinel entry introduced by the zero value before printing.
    wordCount.filterKeys(_ != "##").foreach(println)
  }

  /**
   * Variant 11: fold — aggregate specialized to one merge function, again with
   * the "##" sentinel zero value that is filtered out before printing.
   */
  def Wordcount11(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))

    val mapWord = words.map(word => {
      mutable.Map[String, Long]((word, 1))
    })

    val wordCount = mapWord.fold(mutable.Map[String, Long](("##", 0)))(
      (map1, map2) => {
        // Merge: fold every entry of map2 into map1.
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
          }
        }
        map1
      }
    )
    wordCount.filterKeys(_ != "##").foreach(println)
  }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值