package com.xxx.proj
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
/**
 * Demonstrates nine different ways to implement word count on a Spark RDD,
 * using the same fixed sample input for each variant.
 *
 * Fixes over the original:
 *  - every variant previously created a SparkContext and never stopped it
 *    (resource leak); context lifecycle is now handled in one helper with
 *    a try/finally.
 *  - the duplicated conf/context/sample-RDD setup is factored into
 *    [[withWords]]; each public method keeps its original name and signature.
 */
object Test16 {

  /** Entry point: runs the currently selected word-count variant. */
  def main(args: Array[String]): Unit = {
    // wordCount1()
    // wordCount2()
    // wordCount3()
    // wordCount4()
    // wordCount5()
    // wordCount6()
    // wordCount7()
    // wordCount8()
    wordCount9()
  }

  /**
   * Creates a local SparkContext, splits the fixed sample sentences into a
   * word RDD, runs `body` against it, and always stops the context afterwards.
   *
   * @param body computation to run on the RDD of individual words
   */
  private def withWords(body: RDD[String] => Unit): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCount")
    val sparkContext = new SparkContext(sparkConf)
    try {
      val rdd: RDD[String] = sparkContext.makeRDD(List("hello java", "hello spark", "hello scala"))
      val words: RDD[String] = rdd.flatMap(_.split(" "))
      body(words)
    } finally {
      // The original code leaked the context; stop() releases its resources.
      sparkContext.stop()
    }
  }

  /** groupBy: group identical words together, then count each group's size. */
  def wordCount1(): Unit = withWords { words =>
    val group: RDD[(String, Iterable[String])] = words.groupBy(w => w)
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
  }

  /** groupByKey: pair each word with 1, group by key, count the grouped values. */
  def wordCount2(): Unit = withWords { words =>
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val group: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
  }

  /** reduceByKey: pair each word with 1 and sum counts per key (combines map-side). */
  def wordCount3(): Unit = withWords { words =>
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
    wordCount.collect().foreach(println)
  }

  /** aggregateByKey: zero value 0, with separate (here identical) intra- and inter-partition combiners. */
  def wordCount4(): Unit = withWords { words =>
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
    wordCount.collect().foreach(println)
  }

  /** foldByKey: aggregateByKey's special case where both combiners are the same function. */
  def wordCount5(): Unit = withWords { words =>
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
    wordCount.collect().foreach(println)
  }

  /** combineByKey: createCombiner is identity, then sum within and across partitions. */
  def wordCount6(): Unit = withWords { words =>
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.combineByKey(w => w, (a: Int, b: Int) => a + b, (a: Int, b: Int) => a + b)
    wordCount.collect().foreach(println)
  }

  /** countByKey: action that returns the per-key counts as a Map on the driver. */
  def wordCount7(): Unit = withWords { words =>
    val wordOne: RDD[(String, Int)] = words.map((_, 1))
    val wordCount: collection.Map[String, Long] = wordOne.countByKey()
    wordCount.foreach(println)
  }

  /** countByValue: action that counts occurrences of each distinct element directly. */
  def wordCount8(): Unit = withWords { words =>
    val wordCount: collection.Map[String, Long] = words.countByValue()
    wordCount.foreach(println)
  }

  /**
   * reduce: map each word to a single-entry mutable map, then merge all maps
   * by summing counts. Mutating the left operand is safe here because every
   * per-record map is freshly created by the preceding `map`.
   */
  def wordCount9(): Unit = withWords { words =>
    val wordMap: RDD[mutable.Map[String, Int]] = words.map(w => mutable.Map[String, Int]((w, 1)))
    val wordCount: mutable.Map[String, Int] = wordMap.reduce { (acc, m) =>
      m.foreach { case (word, count) =>
        acc.update(word, acc.getOrElse(word, 0) + count)
      }
      acc
    }
    println(wordCount)
  }
}
// Various ways to implement word count in Spark
// (latest recommended article published 2023-01-14 19:55:05)