import org.apache.spark.{SparkContext, SparkConf}
object Ex1_SimpleRDD {
  // Entry point: builds a small RDD of the integers 1..10, transforms it,
  // collects the result back to the driver, and inspects how the data is
  // spread across partitions.
  // Fixes vs. original: explicit ": Unit =" (procedure syntax is deprecated)
  // and sc.stop() in a finally block so the SparkContext is always released.
  def main(args: Array[String]): Unit = {
    // Run locally on 4 threads; the app name shows up in the Spark UI.
    val conf = new SparkConf().setAppName("Ex1_SimpleRDD").setMaster("local[4]")
    val sc = new SparkContext(conf)
    try {
      // put some data in an RDD, explicitly requesting 4 partitions
      val numbers = 1 to 10
      val numbersRDD = sc.parallelize(numbers, 4)
      println("Print each element of the original RDD")
      // NOTE: on a real cluster foreach(println) prints on the executors, not
      // the driver; the output is only visible here because master is local[4].
      numbersRDD.foreach(println)
      // trivially operate on the numbers
      val stillAnRDD = numbersRDD.map(n => n.toDouble / 10)
      // get the data back out (collect() materializes the RDD on the driver)
      val nowAnArray = stillAnRDD.collect()
      // interesting how the array comes out sorted but the RDD didn't
      println("Now print each element of the transformed array")
      nowAnArray.foreach(println)
      // explore RDD properties: glom() gathers each partition into one Array,
      // yielding an RDD with one Array element per partition
      val partitions = stillAnRDD.glom()
      println("We _should_ have 4 partitions")
      println(partitions.count())
      partitions.foreach(a => {
        println("Partition contents:" +
          a.foldLeft("")((s, e) => s + " " + e))
      })
    } finally {
      // Always release the SparkContext, even if an action above throws.
      sc.stop()
    }
  }
}
3. 优化后的代码+注释
import org.apache.spark.{SparkConf, SparkContext}
object Ex1_SimpleRDD_FIX {
  // Optimized variant of Ex1_SimpleRDD: placeholder lambda syntax, caching of
  // RDDs that are reused by several actions, and an explicit shutdown.
  // Fix vs. original text: explicit ": Unit =" (procedure syntax is
  // deprecated) and all explanatory comments translated to English.
  def main(args: Array[String]): Unit = {
    // The conf builder chain is short, so keeping it on a single line is fine.
    val conf = new SparkConf().setAppName("Ex1_SimpleRDD").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // put some data in an RDD
    val numbers = 1 to 10
    val numbersRDD = sc.parallelize(numbers, 4)
    println("Print each element of the original RDD")
    numbersRDD.foreach(println)
    // trivially operate on the numbers
    // val stillAnRDD = numbersRDD.map(n => n.toDouble / 10)
    // The placeholder form below is equivalent; once you are comfortable with
    // Scala syntax, the underscore version is strongly recommended.
    val stillAnRDD = numbersRDD.map(_.toDouble / 10)
    // stillAnRDD is consumed by multiple actions below (collect, glom), so
    // cache it to avoid recomputing the lineage each time.
    stillAnRDD.persist()
    // get the data back out
    val nowAnArray = stillAnRDD.collect()
    // interesting how the array comes out sorted but the RDD didn't
    println("Now print each element of the transformed array")
    nowAnArray.foreach(println)
    // explore RDD properties
    val partitions = stillAnRDD.glom()
    // partitions is also used by multiple actions (count, foreach): cache it.
    // (No explicit unpersist needed here: sc.stop() below releases all blocks.)
    partitions.persist()
    println("We _should_ have 4 partitions")
    println(partitions.count())
    // partitions.foreach(a => {
    //   println("Partition contents:" +
    //     a.foldLeft("")((s, e) => s + " " + e))
    // })
    // The version below is easier to read:
    // mkString joins the container's elements into a String using the given
    // separator; it is defined on scala.collection.TraversableOnce, so any
    // type implementing that trait can use it.
    // Prefer descriptive names over single letters like a or b — e.g. "arr"
    // (named by type) or "elements" (named by content).
    partitions.foreach(arr => {
      println("Partition contents: " + arr.mkString(" "))
    })
    // Spark applications usually shut down automatically, but it is best to
    // stop the context explicitly as soon as the cluster work is done.
    sc.stop()
    // Purely local (non-Spark) work can run after the manual shutdown; doing
    // so avoids holding cluster resources for no reason.
  }
}