Various ways to implement word count in Spark
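
Nine variants follow, one per method. All of them count the words in the same
sample input, List("hello java", "hello spark", "hello scala"), and all print
the same counts (ordering may vary; wordCount9 prints a single Map rather than
one pair per line):

(hello,3)
(java,1)
(spark,1)
(scala,1)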

package com.xxx.proj

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object Test16 {
  def main(args: Array[String]): Unit = {
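    // Only one SparkContext may be active per JVM, so uncomment and run a
    // single variant at a time.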
    //    wordCount1()
    //    wordCount2()
    //    wordCount3()
    //    wordCount4()
    //    wordCount5()
    //    wordCount6()
    //    wordCount7()
    //    wordCount8()
    wordCount9()

  }

  // groupBy: group identical words together, then map each group to its size.
  // The full word strings are shuffled; there is no map-side combining.
  def wordCount1(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val group: RDD[(String, Iterable[String])] = words.groupBy(w => w)
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
    sparkContext.stop()
  }

  // groupByKey: shuffle every (word, 1) pair, then count the values per key.
  // Like groupBy, it does no map-side combining.
  def wordCount2(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val group: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
    sparkContext.stop()
  }

  // reduceByKey: sum the 1s per key. Values are combined map-side before the
  // shuffle, making this the idiomatic choice for word count.
  def wordCount3(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
    wordCount.collect().foreach(println)
    sparkContext.stop()
  }

  // aggregateByKey: start from the zero value 0; here the in-partition (seqOp)
  // and cross-partition (combOp) functions are both addition.
  def wordCount4(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
    wordCount.collect().foreach(println)
    sparkContext.stop()
  }

  // foldByKey: a shorthand for aggregateByKey when seqOp and combOp are the
  // same function.
  def wordCount5(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
    wordCount.collect().foreach(println)
    sparkContext.stop()
  }

  // combineByKey: supply createCombiner, mergeValue and mergeCombiners.
  def wordCount6(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    // The parameter types must be written out: the combiner type C cannot be
    // inferred for placeholder functions like _ + _ here.
    val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
      v => v,
      (x: Int, y: Int) => x + y,
      (x: Int, y: Int) => x + y
    )
    wordCount.collect().foreach(println)
    sparkContext.stop()
  }

  // countByKey: an action that returns the count of each key as a Map on the
  // driver, so no collect() is needed.
  def wordCount7(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: collection.Map[String, Long] = wordOne.countByKey()
    wordCount.foreach(println)
    sparkContext.stop()
  }

  // countByValue: an action that counts each distinct element directly; there
  // is no need to map to (word, 1) pairs first.
  def wordCount8(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))
    val wordCount: collection.Map[String, Long] = words.countByValue()
    wordCount.foreach(println)
    sparkContext.stop()
  }

  // reduce: map each word to a single-entry mutable Map, then merge the maps
  // pairwise with the reduce action; the final Map lives on the driver.
  def wordCount9(): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
    val words: RDD[String] = rdd.flatMap(_.split(" "))

    val wordMap: RDD[mutable.Map[String, Int]] = words.map(w => mutable.Map(w -> 1))
    val wordCount: mutable.Map[String, Int] = wordMap.reduce((map1, map2) => {
      map2.foreach { case (word, count) =>
        val newCount = map1.getOrElse(word, 0) + count
        map1.update(word, newCount)
      }
      map1
    })

    println(wordCount)
    sparkContext.stop()
  }

}
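
To run any variant locally, only spark-core needs to be on the classpath. A
minimal sbt sketch (the version numbers below are assumptions; use a Spark
release whose Scala build matches your project):

// build.sbt -- minimal sketch, versions are assumptions
name := "word-count-variants"
scalaVersion := "2.12.18"
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.3.2"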
