//Approach 4: map + aggregateByKey
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object WordCount04 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("aggregateByKey")
      .master("local[2]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data/thatgirl.txt")
    //Flatten: split each line into one word per element
    val word: RDD[String] = lines.flatMap(_.split(" "))
    //Map each word to a (key, 1) pair
    val mapRDD: RDD[(String, Int)] = word.map((_, 1))
    //zeroValue is the initial value of each key's accumulator; the first function
    //folds the values into it within each partition, and the second function
    //merges the per-partition results across partitions
    val res: RDD[(String, Int)] = mapRDD.aggregateByKey(0)(_ + _, _ + _)
    res.collect.foreach(println)
    sc.stop()
  }
}
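//A hedged side sketch (not one of the word-count variants above): it shows that the two
//functions given to aggregateByKey may differ. The intra-partition function keeps the
//max value per key, and the inter-partition function sums those per-partition maxima.
//Object name and sample data are illustrative only.
import org.apache.spark.sql.SparkSession
object AggregateByKeyMaxThenSumSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("aggregateByKey-sketch").master("local[2]").getOrCreate()
    val sc = spark.sparkContext
    val sample = sc.makeRDD(Seq(("a", 3), ("a", 5), ("b", 2), ("a", 4), ("b", 7)), 2)
    //seqOp: max within a partition; combOp: sum of the per-partition maxima
    val maxThenSum = sample.aggregateByKey(0)((max, v) => math.max(max, v), _ + _)
    maxThenSum.collect.foreach(println) //with this 2-way split: (a,9) and (b,7)
    sc.stop()
  }
}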
//Approach 5: map + foldByKey
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object WordCount05 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("foldByKey")
      .master("local[2]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data/thatgirl.txt")
    //Flatten: split each line into one word per element
    val word: RDD[String] = lines.flatMap(_.split(" "))
    //Map each word to a (key, 1) pair
    val mapRDD: RDD[(String, Int)] = word.map((_, 1))
    //When aggregateByKey's intra-partition and inter-partition logic are the same,
    //it can be shortened to foldByKey
    val res: RDD[(String, Int)] = mapRDD.foldByKey(0)(_ + _)
    res.collect.foreach(println)
    sc.stop()
  }
}
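//A hedged side sketch (not one of the word-count variants above): it illustrates why the
//zeroValue passed to foldByKey is normally the identity of the fold function. The zeroValue
//is folded in once per partition that contains the key, so a non-zero value is counted once
//per such partition. Object name and sample data are illustrative only.
import org.apache.spark.sql.SparkSession
object FoldByKeyZeroValueSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("foldByKey-sketch").master("local[2]").getOrCreate()
    val sc = spark.sparkContext
    //three (a,1) pairs spread over three partitions, one pair per partition
    val pairs = sc.makeRDD(Seq(("a", 1), ("a", 1), ("a", 1)), 3)
    pairs.foldByKey(0)(_ + _).collect.foreach(println)  //(a,3)
    pairs.foldByKey(10)(_ + _).collect.foreach(println) //(a,33): the 10 is folded in once in each of the 3 partitions
    sc.stop()
  }
}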
//Approach 6: map + combineByKey
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object WordCount06 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("combineByKey")
      .master("local[2]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data/thatgirl.txt")
    //Flatten: split each line into one word per element
    val word: RDD[String] = lines.flatMap(_.split(" "))
    //Map each word to a (key, 1) pair
    val mapRDD: RDD[(String, Int)] = word.map((_, 1))
    //combineByKey() works much like aggregateByKey().
    //It takes three functions: the first transforms the first value seen for a key
    //in a partition, the second merges further values within a partition,
    //and the third merges the per-partition results across partitions.
    //Unlike aggregateByKey(), combineByKey() lets the result type differ from the
    //input value type. Explicit parameter types are needed here because the compiler
    //cannot infer the combiner type from untyped lambdas.
    val res: RDD[(String, Int)] = mapRDD.combineByKey(
      (v: Int) => v,
      (x: Int, y: Int) => x + y,
      (x: Int, y: Int) => x + y
    )
    res.collect.foreach(println)
    sc.stop()
  }
}
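//A hedged side sketch (not one of the word-count variants above): it shows the point made
//in the comments of Approach 6, that combineByKey can produce a value type different from
//the input. (String, Int) score pairs are combined into (sum, count) tuples and then mapped
//to an average per key. Object name and sample data are illustrative only.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
object CombineByKeyAverageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("combineByKey-sketch").master("local[2]").getOrCreate()
    val sc = spark.sparkContext
    val scores = sc.makeRDD(Seq(("a", 88), ("b", 95), ("a", 91), ("b", 93)), 2)
    val sumCount: RDD[(String, (Int, Int))] = scores.combineByKey(
      (v: Int) => (v, 1),                                           //first value of a key in a partition -> (sum, count)
      (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),        //fold another value into the partition-local accumulator
      (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)  //merge accumulators from different partitions
    )
    sumCount.mapValues { case (sum, count) => sum.toDouble / count }
      .collect.foreach(println)
    sc.stop()
  }
}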