A Collection of Spark WordCount Implementations

1、groupBy

Group by the word itself, so all occurrences of the same word land in the same group.

mapValues then maps each group to its size (it only looks at the grouped iterator, not at any key-value pair's value), which gives the number of occurrences of each word.

  def sparkWordCount1(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val group: RDD[(String, Iterable[String])] = words.groupBy(word => word)
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => (iter.size))
    wordCount.collect().foreach(println)
  }
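
All of the variants below use this same sample data, so they should produce equivalent counts. With it, the output of collect() should look roughly like the following (the ordering of collect() results is not guaranteed):

    (Hello,3)
    (Scala,2)
    (Spark,2)
    (Hive,1)
    (Java,1)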

2、groupByKey

map => reshape the data: "word" => ("word", 1)

groupByKey then groups the pairs by key.

groupByKey shuffles every ("word", 1) record across the network, so it is relatively inefficient; prefer reduceByKey (next section), which pre-aggregates on the map side before the shuffle.

  def sparkWordCount2(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val group: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
  }

3、reduceByKey

map => reshape the data: "word" => ("word", 1)

reduceByKey aggregates the values of each key directly (with map-side combining), without building an intermediate group.

  def sparkWordCount3(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
    wordCount.collect().foreach(println)
  }

4、aggregateByKey

aggregateByKey(initial value)(rule applied within a partition, rule applied across partitions)

  def sparkWordCount4(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
    wordCount.collect().foreach(println)
  }
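
With _ + _ used for both rules, the difference between the two functions is invisible. A minimal sketch of my own (the data and the method name are invented for illustration) where they differ: take the maximum value per key within each partition, then sum those maxima across partitions.

  def aggregateByKeyDemo(sc: SparkContext) = {
    // two partitions, so both the within-partition and across-partition rules come into play
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(("Hello", 1), ("Hello", 2), ("Hello", 3), ("Spark", 4)), 2
    )
    // within a partition: keep the max per key; across partitions: add the per-partition maxima
    val result: RDD[(String, Int)] = rdd.aggregateByKey(0)(math.max(_, _), _ + _)
    result.collect().foreach(println)
  }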

5、foldByKey

foldByKey(initial value)(one rule, used both within and across partitions)

Usable when the within-partition and across-partition rules are the same.

  def sparkWordCount5(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
    wordCount.collect().foreach(println)
  }

6、combineByKey

combineByKey(how to turn the first value seen for a key into the accumulator, within-partition rule, across-partition rule)

  def sparkWordCount6(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
      value => value,
      (x: Int, y: Int) => x + y,
      (x: Int, y: Int) => x + y
    )
    wordCount.collect().foreach(println)
  }
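
For word count the first function is just the identity, so combineByKey looks no different from aggregateByKey. A sketch of my own (data and method name invented for illustration) where it actually matters: computing the average value per key by turning the first value into a (sum, count) accumulator.

  def combineByKeyAvgDemo(sc: SparkContext) = {
    val rdd: RDD[(String, Int)] = sc.makeRDD(
      List(("Hello", 1), ("Hello", 3), ("Spark", 2), ("Spark", 4), ("Spark", 6))
    )
    val sumCount: RDD[(String, (Int, Int))] = rdd.combineByKey(
      value => (value, 1),                                          // first value of a key => (sum, count)
      (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),        // fold in a value within a partition
      (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)  // merge accumulators across partitions
    )
    val avg: RDD[(String, Double)] = sumCount.mapValues(t => t._1.toDouble / t._2)
    avg.collect().foreach(println)
  }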

7、countByKey

Directly counts how many values each key has; the result is a Map collected back to the driver (countByKey is an action, not a transformation).

  def sparkWordCount7(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordCount: collection.Map[String, Long] = wordOne.countByKey()
    wordCount.foreach(println)
  }

8、countByValue

countByValue counts how many times each whole element ("word", 1) occurs; despite the name, it has nothing to do with the value side of the key-value pair.

  def sparkWordCount8(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordCount: collection.Map[(String, Int), Long] = wordOne.countByValue()
    wordCount.foreach(println)
  }
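
Since countByValue counts whole elements, the map to (word, 1) is not actually needed here; a shorter variant of my own with the same idea:

  def sparkWordCount8b(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(line => line.split(" "))
    // each distinct word is counted as a whole element; no (word, 1) pairing required
    val wordCount: collection.Map[String, Long] = words.countByValue()
    wordCount.foreach(println)
  }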

9、reduce

For an RDD[T], reduce takes a function (x: T, y: T) => T and returns a single T, so the element type has to carry the accumulated state itself.


  def sparkWordCount9(sc: SparkContext) = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
    val words: RDD[String] = rdd.flatMap(
      line => line.split(" ")
    )
    //map() reshapes each word into a single-entry map
    //a mutable.Map is convenient to merge
    //it provides foreach and getOrElse
    val mapWord: RDD[mutable.Map[String, Long]] = words.map(
      word =>
        mutable.Map[String, Long]((word, 1))
    )
    val wordCount: mutable.Map[String, Long] = mapWord.reduce(
      (map1, map2) => {
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0L) + count
            //getOrElse(key, default): returns the value for key if present, otherwise default
            map1.update(word, newCount)
          }
        }
        map1
      }
    )
    println(wordCount)
  }

10、aggregate

  def sparkWordCount10(sc: SparkContext) = {
    //sc is passed in, so there is no need to build another SparkConf/SparkContext here
    //read a text file with 2 partitions
    val wordstext: RDD[String] = sc.textFile("datas/word.txt", 2)
    val words: RDD[String] = wordstext.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[mutable.Map[String, Int]] = words.map(
      word => mutable.Map((word, 1))
    )
    val wordCount: mutable.Map[String, Int] = wordOne.aggregate(mutable.Map[String, Int]())(
      (map1, map2) => {
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0) + count
            map1.update(word, newCount)
          }
        }
        map1
      },
      (map3, map4) => {
        map4.foreach {
          case (word, count) => {
            val newCount: Int = map3.getOrElse(word, 0) + count
            map3.update(word, newCount)
          }
        }
        map3
      }
    )
    println(wordCount)
  }
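
Unlike aggregateByKey, aggregate is an action, and its initial value takes part both in the within-partition rule and once more when the partition results are merged on the driver. A small sketch of my own (assuming 2 partitions) that makes this visible:

  def aggregateZeroDemo(sc: SparkContext) = {
    val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4), 2)
    // each partition starts from 10, and the driver-side merge starts from 10 again:
    // 10 + (10 + 1 + 2) + (10 + 3 + 4) = 40
    val result: Int = rdd.aggregate(10)(_ + _, _ + _)
    println(result)
  }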

11、fold

  def sparkWordCount11(sc: SparkContext) = {
    //as above, reuse the sc that is passed in instead of creating a new SparkContext
    val wordstext: RDD[String] = sc.textFile("datas/word.txt", 2)
    val words: RDD[String] = wordstext.flatMap(
      line => line.split(" ")
    )
    val wordOne: RDD[mutable.Map[String, Int]] = words.map(
      word => mutable.Map((word, 1))
    )
    val wordCount: mutable.Map[String, Int] = wordOne.fold(mutable.Map[String, Int]())(
      (map1, map2) => {
        map2.foreach {
          case (word, count) => {
            val newCount = map1.getOrElse(word, 0) + count
            map1.update(word, newCount)
          }
        }
        map1
      }
    )
    println(wordCount)
  }

1、For aggregate and fold the main point is the type of the initial value: once the initial value's type is fixed, the parameter types of the following rules are fixed as well.

        aggregate(mutable.Map[String, Int]())(within-partition rule, across-partition rule)
        fold(mutable.Map[String, Int]())(one rule, used both within and across partitions)

2、map() is used constantly; its main job is to reshape the data into whatever structure you need.
