12 ways to implement word count in Spark

The data file (data/word.txt) contains lines such as:
spark spark
hello spark
...
scala spark spark

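Every variant below starts from the same preprocessing: read data/word.txt, split each line on spaces, and (for most variants) pair every word with the count 1. For reference, a minimal sketch of that shared skeleton; the object name WordCountSkeleton is just a placeholder, and only the final counting operator differs between the twelve versions:

package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCountSkeleton {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("skeleton")
    val sc = new SparkContext(sparkconf)

    // one String per line of the input file
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    // split every line on spaces and flatten to one word per element
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    // pair each word with an initial count of 1
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_, 1))

    // ... one of the twelve counting operators goes here ...

    sc.stop()
  }
}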

//TODO Method 1: aggregate
package com.bigdata.spark.core.wordcount12
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * @author shkstart
 * @create 2020-09-06 22:10
 */
object aggregate {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("aggregate")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_,1))
    // aggregate is an action: the zero value is an empty Map, the first function folds
    // each (word, 1) pair into a per-partition Map, and the second function merges the
    // per-partition Maps on the driver
    val stringToInt: Map[String, Int] = mapRDD.aggregate(Map[String, Int]())((kv1, kv2) => {
      // kv1 is the accumulated Map, kv2 is one (word, 1) element
      val v: Int = kv1.getOrElse(kv2._1, 0) + kv2._2
      kv1.updated(kv2._1, v)
    }, (map1, map2) => {
      // merge two Maps by folding map1's entries into map2 and summing the counts
      map1.foldLeft(map2) {
        case (map, (k, v)) =>
          map + (k -> (map.getOrElse(k, 0) + v))
      }
    })
    println(stringToInt)
    sc.stop()
  }
}
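The combiner above merges two per-partition Maps by folding one into the other; the same merge logic reappears in the fold and reduce variants further down. A minimal standalone sketch of that merge on plain Scala Maps (the sample maps are made up for illustration):

object MergeMapsDemo {
  def main(args: Array[String]): Unit = {
    // two hypothetical per-partition word-count Maps
    val map1 = Map("spark" -> 2, "hello" -> 1)
    val map2 = Map("spark" -> 3, "scala" -> 1)

    // fold map1's entries into map2, summing the counts of keys present in both
    val merged: Map[String, Int] = map1.foldLeft(map2) {
      case (map, (k, v)) => map + (k -> (map.getOrElse(k, 0) + v))
    }

    println(merged) // Map(spark -> 5, scala -> 1, hello -> 1)
  }
}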

 

 

//TODO Method 2: aggregateByKey
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:43
 */
object aggregateByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("aggregateByKey")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val map1RDD: RDD[(String, Int)] = flatRDD.map((_,1))
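    // aggregateByKey: zero value 0 per key; the first function adds values within a
    // partition, the second merges the partition results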
    val aggRDD: RDD[(String, Int)] = map1RDD.aggregateByKey(0)(_+_,_+_)
    aggRDD.collect.foreach(println)
    sc.stop()
  }
}

 

 

 

//TODO Method 3: cogroup
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:57
 */
object cogroup {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("cogroup")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_,1))
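    // cogroup the pair RDD with itself: each word maps to two identical Iterables of 1s,
    // so the size of either Iterable is that word's count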
    val cogroupRDD: RDD[(String, (Iterable[Int], Iterable[Int]))] = mapRDD.cogroup(mapRDD)
    val map1RDD: RDD[(String, Int)] = cogroupRDD.map(kv => {
      val num: Int = kv._2._1.size
      (kv._1, num)
    })
    map1RDD.collect.foreach(println)
    sc.stop()
  }
}

 

//TODO Method 4: combineByKey
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:48
 */
object combineByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("combineByKey")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_,1))
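    // combineByKey: the first value of a key becomes the initial combiner, the second
    // function adds values within a partition, the third merges combiners across partitions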
    val comRDD: RDD[(String, Int)] = mapRDD.combineByKey(num => num,
      (x: Int, y: Int) => (x + y),
      (x: Int, y: Int) => (x + y)
    )
    comRDD.collect.foreach(println)
    sc.stop()
  }
}

 

 

//TODO Method 5: countByKey
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:52
 */
object countByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("countByKey")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_,1))
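    // countByKey is an action: it returns a local Map[String, Long] with the number of
    // elements per key, so the value 1 is never actually summed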
    mapRDD.countByKey().foreach(println)
    sc.stop()
  }
}

 

 

//TODO Method 6: countByValue
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:55
 */
object countByValue {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("countByValue")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
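    // countByValue is an action: it returns a local Map[String, Long] with the number of
    // occurrences of each word, so no (word, 1) mapping is needed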
    flatRDD.countByValue().foreach(println)
    sc.stop()
  }
}

 

 

//TODO Method 7: fold
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 23:34
 */
object fold {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("fold")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[Map[String, Int]] = flatRDD.map(str => {
      Map[String, Int](str->1)
    })
    // fold is an action: the zero value is an empty Map, and the same function merges
    // the single-entry Maps within each partition and the partition results on the driver
    val stringToInt: Map[String, Int] = mapRDD.fold(Map[String, Int]())((map1, map2) => {
      // alternative formulation using the /: (foldLeft) operator:
      // (map1 /: map2) { case (map, (k, v)) => map + (k -> (v + map.getOrElse(k, 0))) }
      // merge two Maps by folding map1's entries into map2 and summing the counts
      map1.foldLeft(map2) {
        case (map, (k, v)) =>
          val newv: Int = map.getOrElse(k, 0) + v
          map.updated(k, newv)
      }
    })
    println(stringToInt)
    sc.stop()
  }
}

 

 

//TODO Method 8: foldByKey
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:45
 */
object foldByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("foldByKey")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val map1RDD: RDD[(String, Int)] = flatRDD.map((_,1))
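    // foldByKey works like reduceByKey but starts from the zero value 0 for each key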
    val foldRDD: RDD[(String, Int)] = map1RDD.foldByKey(0)(_+_)
    foldRDD.collect.foreach(println)
    sc.stop()
  }
}

 

 

//TODO Method 9: groupBy
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:18
 */
object groupBy {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("groupBy")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
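    // group identical words together: the count is simply the size of each group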
    val groupRDD: RDD[(String, Iterable[String])] = flatRDD.groupBy(word=>word)
    val mapRDD: RDD[(String, Int)] = groupRDD.map(kv =>
      (kv._1, kv._2.size)
    )
    mapRDD.collect.foreach(println)
    sc.stop()
  }
}

 

 

//TODO Method 10: groupByKey
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:39
 */
object groupByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("groupByKey" )
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_,1))
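    // groupByKey shuffles all the 1s for each word; the count is the size of the Iterable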
    val groupRDD: RDD[(String, Iterable[Int])] = mapRDD.groupByKey()
    val map1RDD: RDD[(String, Int)] = groupRDD.map(kv=>(kv._1,kv._2.size))
    map1RDD.collect.foreach(println)
    sc.stop()
  }
}

 

 

//TODO Method 11: reduce
package com.bigdata.spark.core.wordcount12
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * @author shkstart
 * @create 2020-09-07 0:01
 */
object reduce {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("reduce" )
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[Map[String, Int]] = flatRDD.map(str => {
      Map[String, Int](str->1)
    })
    // reduce is an action: the single-entry Maps are merged pairwise, within each
    // partition first and then across partitions on the driver
    val stringToInt: Map[String, Int] = mapRDD.reduce((map1, map2) => {
      // merge two Maps by folding map1's entries into map2 and summing the counts
      map1.foldLeft(map2) {
        case (map, (k, v)) =>
          val newv: Int = map.getOrElse(k, 0) + v
          map.updated(k, newv)
      }
    })
    println(stringToInt)
    sc.stop()
  }
}

 

 

//TODO Method 12: reduceByKey
package com.bigdata.spark.core.wordcount12
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * @author shkstart
 * @create 2020-09-06 18:33
 */
object reduceByKey {
  def main(args: Array[String]): Unit = {
    val sparkconf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("reduceByKey")
    val sc = new SparkContext(sparkconf)
    val rdd: RDD[String] = sc.textFile("data/word.txt")
    val flatRDD: RDD[String] = rdd.flatMap(_.split(" "))
    val mapRDD: RDD[(String, Int)] = flatRDD.map((_,1))
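    // reduceByKey sums the 1s per word, combining map-side before the shuffle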
    val reduceByKey: RDD[(String, Int)] = mapRDD.reduceByKey(_+_)
    reduceByKey.collect.foreach(println)
    sc.stop()
  }
}