import scala.collection.Iterator
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkException, SparkContext, SparkConf}
import scala.collection.mutable.ListBuffer
object Ex3_CombiningRDDs {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("Ex3_CombiningRDDs").setMaster("local[4]")
val sc = new SparkContext(conf)
// put some data in an RDD
val letters = sc.parallelize('a' to 'z', 8)
// another RDD of the same type
val vowels = sc.parallelize(Seq('a', 'e', 'i', 'o', 'u'), 4)
// subtract one from another, getting yet another RDD of the same type
val consonants = letters.subtract(vowels)
println("There are " + consonants.count() + " consonants")
val vowelsNotLetters = vowels.subtract(letters)
println("There are " + vowelsNotLetters.count() + " vowels that aren't letters")
// union
val lettersAgain = consonants ++ vowels
println("There really are " + lettersAgain.count() + " letters")
// union keeps duplicates; distinct() removes them below
val tooManyVowels = vowels ++ vowels
println("There aren't really " + tooManyVowels.count() + " vowels")
val justVowels = tooManyVowels.distinct()
println("There are actually " + justVowels.count() + " vowels")
// subtraction with duplicates
val what = tooManyVowels.subtract(vowels)
println("There are actually " + what.count() + " whats")
// intersection
val earlyLetters = sc.parallelize('a' to 'l', 2)
val earlyVowels = earlyLetters.intersection(vowels)
println("The early vowels:")
earlyVowels.foreach(println)
// RDD of a different type
val numbers = sc.parallelize(1 to 2, 2)
// cartesian product
val cp = vowels.cartesian(numbers)
println("Product has " + cp.count() + " elements")
// index the letters
val indexed = letters.zipWithIndex()
println("indexed letters")
indexed foreach {
case (c, i) => println(i + ": " + c)
}
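// zipWithIndex numbers elements by partition, then by position within each
// partition; it triggers a Spark job when the RDD has more than one partition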
// another RDD, same size and partitioning as letters
val twentySix = sc.parallelize(101 to 126, 8)
// zip the letters and numbers
val differentlyIndexed = letters.zip(twentySix)
differentlyIndexed foreach {
case (c, i) => println(i + ": " + c)
}
// we can't do this if the two RDDs don't have the same partitioning --
// this is to remind us that it would be enormously costly in terms
// of communication, so, as we'll see in later examples, we have to
// fix the partitioning ourselves
val twentySixBadPart = sc.parallelize(101 to 126, 3)
val cantGet = letters.zip(twentySixBadPart)
try {
cantGet foreach {
case (c, i) => println(i + ": " + c)
}
} catch {
case iae: IllegalArgumentException =>
println("Exception caught: " + iae.getMessage)
}
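// One way to "fix the partitioning ourselves" (a sketch; the name 'aligned' is
// ours, not part of the original example): key both sides by a shared index,
// join on it, and drop the keys. This pairs the elements correctly, at the
// cost of a shuffle.
val aligned = letters.zipWithIndex().map(_.swap)
.join(twentySixBadPart.zipWithIndex().map(_.swap))
.values
println("aligned by index: " + aligned.count() + " pairs")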
// the zipped RDDs also need to have the same number of elements
val unequalCount = earlyLetters.zip(numbers)
try {
unequalCount foreach {
case (c, i) => println(i + ": " + c)
}
}
catch {
case se: SparkException => {
val t = se.getMessage
println("Exception caught: " + se.getMessage)
}
}
// zipPartitions gives us more control, so we can deal with weird cases
// BUT the result may be surprising because each PARTITION also has
// unequal numbers of elements, and the function 'zipFunc' gets
// applied once per partition!
// also notice the amount of type annotation to make the Scala compiler
// happy -- it's an interesting exercise to remove some of them and read
// the complaints
def zipFunc(lIter: Iterator[Char], nIter: Iterator[Int]) :
Iterator[(Char, Int)] = {
val res = new ListBuffer[(Char, Int)]
while (lIter.hasNext || nIter.hasNext) {
if (lIter.hasNext && nIter.hasNext) {
// easy case
res += ((lIter.next(), nIter.next()))
} else if (lIter.hasNext) {
res += ((lIter.next(), 0))
} else if (nIter.hasNext) {
res += ((' ', nIter.next()))
}
}
res.iterator
}
val unequalOK = earlyLetters.zipPartitions(numbers)(zipFunc)
println("this may not be what you expected with unequal length RDDs")
unequalOK foreach {
case (c, i) => println(i + ": " + c)
}
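// concretely: earlyLetters puts 6 chars in each of its 2 partitions, while
// numbers puts a single Int in each of its 2, so zipFunc pairs the first letter
// of each partition with a number and pads the remaining five letters with 0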
}
}
3. Optimized code + comments
import org.apache.spark.{SparkConf, SparkContext, SparkException}
import scala.collection.Iterator
import scala.collection.mutable.ListBuffer
object Ex3_CombiningRDDs_FIX {
def main(args: Array[String]): Unit = {
// The points below came up in earlier posts in this series, so they are only listed here:
// 1. Keep each line of code to a reasonable length
// 2. Name Spark distributed datasets (RDD, DataFrame, etc.) with an RDD / DF suffix
// 3. Cache datasets that are used more than once
// 4. Prefer a functional style
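// For rule 3, for example, the pattern applied throughout this version is:
// someRDD.persist() // before the first of several uses
// ... // run several actions over someRDD
// someRDD.unpersist() // after the last use
// (persist() with no arguments caches at the default MEMORY_ONLY level)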
val conf = new SparkConf().setAppName("Ex3_CombiningRDDs").setMaster("local[4]")
val sc = new SparkContext(conf)
// put some data in an RDD
val letterRDD = sc.parallelize('a' to 'z', 8)
letterRDD.persist()
// another RDD of the same type
val vowelRDD = sc.parallelize(Seq('a', 'e', 'i', 'o', 'u'), 4)
vowelRDD.persist()
// subtract one from another, getting yet another RDD of the same type
val consonantsRDD = letterRDD.subtract(vowelRDD)
consonantsRDD.persist()
println("There are " + consonantsRDD.count() + " consonantsRDD")
val vowelsNotLetterRDD = vowelRDD.subtract(letterRDD)
println("There are " + vowelsNotLetterRDD.count() + " vowelRDD that aren't letterRDD")
// union
val lettersAgainRDD = consonantsRDD ++ vowelRDD
println("There really are " + lettersAgainRDD.count() + " letterRDD")
// 缓存不再使用了后,要解除缓存
consonantsRDD.unpersist()
// union keeps duplicates; distinct() removes them below
val tooManyVowelRDD = vowelRDD ++ vowelRDD
tooManyVowelRDD.persist()
println("There aren't really " + tooManyVowelRDD.count() + " vowelRDD")
val justVowelRDD = tooManyVowelRDD.distinct()
println("There are actually " + justVowelRDD.count() + " vowelRDD")
// subtraction with duplicates
val whatRDD = tooManyVowelRDD.subtract(vowelRDD)
println("There are actually " + whatRDD.count() + " whats")
// unpersist cached RDDs once they are no longer needed
tooManyVowelRDD.unpersist()
// intersection
val earlyLetterRDD = sc.parallelize('a' to 'l', 2)
val earlyVowelRDD = earlyLetterRDD.intersection(vowelRDD)
println("The early vowelRDD:")
earlyVowelRDD.foreach(println)
// RDD of a different type
val numberRDD = sc.parallelize(1 to 2, 2)
numberRDD.persist()
// cartesian product
val cpRDD = vowelRDD.cartesian(numberRDD)
println("Product has " + cpRDD.count() + " elements")
// unpersist cached RDDs once they are no longer needed
vowelRDD.unpersist()
// index the letters
val indexed = letterRDD.zipWithIndex()
println("indexed letterRDD")
// indexed foreach {
// case (c, i) => println(i + ": " + c)
// }
// If you want to read this output from the executor logs on a cluster, the
// commented-out form above is fine. To print on the driver, though, first bring
// the data back with take() or collect(): foreach is an action and triggers a
// job, but its println runs inside each executor, so nothing reaches the
// driver's console. (It prints here only because local mode runs everything in
// one JVM, so the "executors" and the driver share a console.)
indexed.collect()
.foreach {
case (letter, index) => println(index + ": " + letter)
}
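// on real data, collect() can overwhelm the driver; prefer a bounded sample, e.g.
// indexed.take(5).foreach { case (letter, index) => println(index + ": " + letter) }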
// you can also try this form:
// indexed.collect().foreach(charIdx => println(charIdx.swap))
// another RDD, same size and partitioning as letterRDD
val twentySix = sc.parallelize(101 to 126, 8)
// zip the letters with the twentySix numbers
val differentlyIndexedRDD = letterRDD.zip(twentySix)
// differentlyIndexedRDD foreach {
// case (c, i) => println(i + ": " + c)
// }
// try this form instead:
differentlyIndexedRDD.collect().foreach(charIdx => println(charIdx.swap))
// we can't do this if the two RDDs don't have the same partitioning --
// this is to remind us that it would be enormously costly in terms
// of communication, so, as we'll see in later examples, we have to
// fix the partitioning ourselves
val twentySixBadPartRDD = sc.parallelize(101 to 126, 3)
val cantGet = letterRDD.zip(twentySixBadPartRDD)
try {
// cantGet foreach {
// case (c, i) => println(i + ": " + c)
// }
// try this form instead:
cantGet.collect().foreach(charIdx => println(charIdx.swap))
} catch {
case iae: IllegalArgumentException =>
println("Exception caught: " + iae.getMessage)
}
// unpersist cached RDDs once they are no longer needed
letterRDD.unpersist()
// the zipped RDDs also need to have the same number of elements
val unequalCount = earlyLetterRDD.zip(numberRDD)
try {
// unequalCount foreach {
// case (c, i) => println(i + ": " + c)
// }
// try this form instead:
unequalCount.collect().foreach(charIdx => println(charIdx.swap))
} catch {
// case se: SparkException => {
// val t = se.getMessage
// println("Exception caught: " + se.getMessage)
// }
// the pointless temporary variable above has been removed
case se: SparkException =>
println("Exception caught: " + se.getMessage)
}
// zipPartitions gives us more control, so we can deal with weird cases
// BUT the result may be surprising because each PARTITION also has
// unequal numbers of elements, and the function 'zipFunc' gets
// applied once per partition!
// also notice the amount of type annotation to make the Scala compiler
// happy -- it's an interesting exercise to remove some of them and read
// the complaints
// def zipFunc(lIter: Iterator[Char], nIter: Iterator[Int]):
// Iterator[(Char, Int)] = {
// val res = new ListBuffer[(Char, Int)]
// while (lIter.hasNext || nIter.hasNext) {
// if (lIter.hasNext && nIter.hasNext) {
// // easy case
// res += ((lIter.next(), nIter.next()))
// } else if (lIter.hasNext) {
// res += ((lIter.next(), 0))
// } else if (nIter.hasNext) {
// res += ((' ', nIter.next()))
// }
// }
// res.iterator
// }
def zipFunc(lIter: Iterator[Char], nIter: Iterator[Int]): Iterator[(Char, Int)] = {
val res = new ListBuffer[(Char, Int)]
// pattern matching makes the branching easier to follow
while (lIter.hasNext || nIter.hasNext) {
(lIter.hasNext, nIter.hasNext) match {
case (true, true) => res += ((lIter.next(), nIter.next()))
case (true, false) => res += ((lIter.next(), 0))
case (false, true) => res += ((' ', nIter.next()))
case (false, false) =>
}
}
res.iterator
}
val unequalOKRDD = earlyLetterRDD.zipPartitions(numberRDD)(zipFunc)
println("this may not be whatRDD you expected with unequal length RDDs")
// unequalOKRDD foreach {
// case (c, i) => println(i + ": " + c)
// }
// try this form instead:
unequalOKRDD.collect().foreach(charIdx => println(charIdx.swap))
sc.stop()
}
}