1. groupBy
Group by the word itself, so identical words land in the same group.
mapValues then works on each group's value (the Iterable of grouped words) and takes its size, which yields the number of occurrences of every word.
def sparkWordCount1(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val group: RDD[(String, Iterable[String])] = words.groupBy(word => word)
  val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
  wordCount.collect().foreach(println)
}
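For reference, every variant below runs on the same four sample lines, so they all produce the same counts (print order may differ from run to run):

  (Hive,1)
  (Hello,3)
  (Java,1)
  (Scala,2)
  (Spark,2)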
2. groupByKey
map reshapes each element: "word" => ("word", 1).
groupByKey then groups the pairs by key.
Every (word, 1) pair has to travel through the shuffle, so this is inefficient; prefer reduceByKey, which pre-aggregates inside each partition before shuffling.
def sparkWordCount2(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
  val group: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
  val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
  wordCount.collect().foreach(println)
}
3. reduceByKey
map reshapes each element: "word" => ("word", 1).
reduceByKey aggregates the values of each key directly, combining within each partition before the shuffle.
def sparkWordCount3(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
  val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
  wordCount.collect().foreach(println)
}
4. aggregateByKey
aggregateByKey(zero value)(intra-partition rule, inter-partition rule): the zero value seeds the accumulator inside each partition, the first function folds values into it, and the second function merges the per-partition results.
def sparkWordCount4(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
  val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
  wordCount.collect().foreach(println)
}
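Because both rules are `_ + _` here, the two-rule form brings no benefit for word count. A minimal sketch of where the separation matters (hypothetical data, assuming two partitions): take the maximum within each partition, then sum the maxima across partitions.

  val pairs = sc.makeRDD(List(("a", 1), ("a", 2), ("a", 3), ("a", 4)), 2)
  // partition 1 holds ("a",1),("a",2); partition 2 holds ("a",3),("a",4)
  val maxThenSum = pairs.aggregateByKey(0)((x, y) => math.max(x, y), _ + _)
  maxThenSum.collect().foreach(println) // (a,6): max 2 in partition 1 plus max 4 in partition 2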
5. foldByKey
foldByKey(zero value)(rule)
Usable when the intra-partition and inter-partition rules are identical.
def sparkWordCount5(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
  val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
  wordCount.collect().foreach(println)
}
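In other words, foldByKey is aggregateByKey with a single shared rule; as a side-by-side on the same wordOne RDD as above:

  wordOne.foldByKey(0)(_ + _)             // same result as the line below
  wordOne.aggregateByKey(0)(_ + _, _ + _) // two rules that happen to coincide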
6. combineByKey
combineByKey(createCombiner, mergeValue, mergeCombiners) takes three functions in a single parameter list: the first decides what to do with the first value seen for a key, the second folds further values in within a partition, and the third merges results across partitions.
def sparkWordCount6(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
  val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
    value => value,             // first value for a key: keep it as the accumulator
    (x: Int, y: Int) => x + y,  // fold further values in within a partition
    (x: Int, y: Int) => x + y   // merge accumulators across partitions
  )
  wordCount.collect().foreach(println)
}
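For word count, createCombiner degenerates to identity. A hedged sketch of where combineByKey earns its keep (hypothetical score data): the accumulator type (sum, count) differs from the value type, which foldByKey and reduceByKey cannot express.

  val scores = sc.makeRDD(List(("a", 88), ("b", 95), ("a", 91), ("b", 93)), 2)
  val sumCount: RDD[(String, (Int, Int))] = scores.combineByKey(
    score => (score, 1),                                           // first value: start a (sum, count) pair
    (acc: (Int, Int), score: Int) => (acc._1 + score, acc._2 + 1), // fold a value into the accumulator
    (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)   // merge partition accumulators
  )
  sumCount.mapValues { case (sum, cnt) => sum.toDouble / cnt }
    .collect()
    .foreach(println) // (a,89.5) and (b,94.0)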
7. countByKey
countByKey counts how many (key, value) pairs exist for each key, ignoring the values themselves, and returns the result as a local Map on the driver.
def sparkWordCount7(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
  val wordCount: collection.Map[String, Long] = wordOne.countByKey()
  wordCount.foreach(println)
}
8. countByValue
countByValue counts how many times each whole element, here the pair ("word", 1), occurs in the RDD; the "value" in the name refers to the RDD element, not to the value of a key-value pair.
def sparkWordCount8(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  val wordOne: RDD[(String, Int)] = words.map(word => (word, 1))
  val wordCount: collection.Map[(String, Int), Long] = wordOne.countByValue()
  wordCount.foreach(println)
}
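Since countByValue counts whole elements, the (word, 1) mapping step is actually unnecessary; calling it directly on the words RDD gives the same counts:

  val direct: collection.Map[String, Long] = words.countByValue()
  direct.foreach(println)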
9. reduce
rdd.reduce((x: T, y: T) => T): both arguments and the result share the RDD's element type T, so to count words the elements are first reshaped into something mergeable.
def sparkWordCount9(sc: SparkContext) = {
  val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark", "Hello Scala", "Hive Spark Java"))
  val words: RDD[String] = rdd.flatMap(line => line.split(" "))
  // map() reshapes each word into a single-entry mutable Map,
  // which is convenient to merge: it offers foreach and getOrElse
  val mapWord: RDD[mutable.Map[String, Long]] = words.map(
    word => mutable.Map[String, Long]((word, 1))
  )
  val wordCount: mutable.Map[String, Long] = mapWord.reduce(
    (map1, map2) => {
      map2.foreach {
        case (word, count) =>
          // getOrElse(key, default): get(key) if the key exists, otherwise default
          val newCount = map1.getOrElse(word, 0L) + count
          map1.update(word, newCount)
      }
      map1
    }
  )
  println(wordCount)
}
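To see the signature from the note above in isolation, a trivial reduce over plain Ints:

  val total: Int = sc.makeRDD(List(1, 2, 3, 4)).reduce(_ + _) // 10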
10. aggregate
aggregate(zero value)(intra-partition rule, inter-partition rule) collapses an entire RDD to a single local value; here both rules merge single-entry maps the same way reduce did above.
def sparkWordCount10(sc: SparkContext) = {
  val wordsText: RDD[String] = sc.textFile("datas/word.txt", 2)
  val words: RDD[String] = wordsText.flatMap(line => line.split(" "))
  val wordOne: RDD[mutable.Map[String, Int]] = words.map(word => mutable.Map((word, 1)))
  val wordCount: mutable.Map[String, Int] = wordOne.aggregate(mutable.Map[String, Int]())(
    (map1, map2) => {
      map2.foreach {
        case (word, count) =>
          val newCount = map1.getOrElse(word, 0) + count
          map1.update(word, newCount)
      }
      map1
    },
    (map3, map4) => {
      map4.foreach {
        case (word, count) =>
          val newCount: Int = map3.getOrElse(word, 0) + count
          map3.update(word, newCount)
      }
      map3
    }
  )
  println(wordCount)
}
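One caveat worth knowing, shown here with a small demo on plain Ints: unlike aggregateByKey, aggregate applies the zero value inside every partition and once more when combining the partition results.

  val n = sc.makeRDD(List(1, 2, 3, 4), 2).aggregate(10)(_ + _, _ + _)
  // partitions: 10 + 1 + 2 = 13 and 10 + 3 + 4 = 17; combine: 10 + 13 + 17 = 40
  println(n) // 40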
11. fold
fold(zero value)(rule) is aggregate with the same rule used both within and between partitions.
def sparkWordCount11(sc: SparkContext) = {
  val wordsText: RDD[String] = sc.textFile("datas/word.txt", 2)
  val words: RDD[String] = wordsText.flatMap(line => line.split(" "))
  val wordOne: RDD[mutable.Map[String, Int]] = words.map(word => mutable.Map((word, 1)))
  val wordCount: mutable.Map[String, Int] = wordOne.fold(mutable.Map[String, Int]())(
    (map1, map2) => {
      map2.foreach {
        case (word, count) =>
          val newCount = map1.getOrElse(word, 0) + count
          map1.update(word, newCount)
      }
      map1
    }
  )
  println(wordCount)
}
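The same caveat applies to fold: the zero value is mixed in per partition and again at combine time, so it should be neutral for the operation. A quick check on Ints:

  val m = sc.makeRDD(List(1, 2, 3, 4), 2).fold(0)(_ + _) // 10; fold(10)(_ + _) would give 40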
1. With aggregate and fold the main question is the type of the zero value: once that type is fixed, the parameter types of the rule(s) that follow are fixed too. Note that fold(mutable.Map[String, Int]())(rule) takes a single rule, applied both within and between partitions.
2. map() comes up constantly; its main job is to reshape the data into whatever structure you need next.
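Point 1 in action, as a micro-example: picking a String as the zero value forces the intra-partition rule to be (String, Int) => String and the inter-partition rule to be (String, String) => String.

  val s = sc.makeRDD(List(1, 2, 3), 1).aggregate("")((str, i) => str + i, _ + _)
  println(s) // "123" with a single partition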