1. Spark 2.x Memory Model
2. Memory Usage of Shuffle
Shuffle Read and Shuffle Write
3. Performance Tuning
4. Using Kryo Serialization
package sparkCore

import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

object KryoDemo {
  def main(args: Array[String]): Unit = {
    // Equivalent configuration via SparkSession:
    // val spark = SparkSession
    //   .builder()
    //   .appName(this.getClass.getName)
    //   .master("local[2]")
    //   .config("spark.serializer", classOf[KryoSerializer].getName) // switch the serializer to Kryo (the default is JavaSerializer)
    //   .config("spark.kryo.classesToRegister", "sparkCore.Fiction") // fully-qualified names of the classes to register, comma-separated
    //   .getOrCreate()
    //
    // val sc = spark.sparkContext
    val conf = new SparkConf()
      .setAppName(this.getClass.getName)
      .setMaster("local[2]")
      .set("spark.serializer", classOf[KryoSerializer].getName) // switch the serializer to Kryo; can be omitted here because registerKryoClasses below already sets it
      .registerKryoClasses(Array(classOf[Fiction])) // register the classes Kryo should serialize
    val sc = new SparkContext(conf)
    val lineRDD = sc.textFile("F:\\ideaProjects\\spark-version2\\src\\main\\resources\\booksInfo")
    val fictionRDD: RDD[Fiction] = lineRDD.map { line =>
      val arr = line.split(" ")
      new Fiction(arr(0), arr(1))
    }
    println(fictionRDD.collect.toBuffer)
  }
}

class Fiction(val name: String, val author: String) {
  override def toString: String = s"title:$name|author:$author"
}
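Kryo's size advantage is easiest to observe when an RDD is stored in serialized form (or shuffled). A minimal sketch, reusing the fictionRDD above; compare the cached size reported on the Spark UI's Storage tab with and without Kryo enabled (the exact numbers depend on your data):

import org.apache.spark.storage.StorageLevel

// Persist the RDD as serialized bytes; with Kryo registered, the cached size
// is typically much smaller than with the default JavaSerializer.
fictionRDD.persist(StorageLevel.MEMORY_ONLY_SER)
fictionRDD.count() // trigger a job so the cached blocks are actually materialized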
5. RDD Persistence
5.1 cache and persist
package sparkCore.rddPersist

import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel

object RDDPersistV1 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val rdd01 = sc.makeRDD(List("hello java", "hello scala", "hello spark"))
    val rdd02 = rdd01.flatMap(x => {
      println("flatMap..."); x.split(" ")
    })
    val rdd03 = rdd02.map(x => {
      println("map..."); (x, 1)
    })
    // rdd03.cache() // equivalent to persist(StorageLevel.MEMORY_ONLY)
    // Persist to disk. These are temporary files: they are deleted once the
    // application, i.e. all of its jobs, has finished.
    rdd03.persist(StorageLevel.DISK_ONLY)
    val rdd04 = rdd03.reduceByKey(_ + _)
    rdd04.collect.foreach(println(_))
    println("*****************************")
    // Without persistence, the rdd03 object is reused here, but an RDD does not
    // store data, so the data itself cannot be reused: the whole lineage has to
    // be executed again to recompute it, which is why "flatMap..." and "map..."
    // would still be printed.
    val rdd05 = rdd03.groupByKey()
    rdd05.collect.foreach(println(_))
    sc.stop()

    /**
     * Summary:
     * cache() simply calls persist(StorageLevel.MEMORY_ONLY): data is cached in the
     * JVM heap, and some partitions may be evicted when memory runs short.
     * persist(StorageLevel.DISK_ONLY): persist() accepts different storage levels.
     * Persist when a step is expensive to compute or its data is important.
     * cache and persist store data only temporarily: it is deleted automatically
     * once the application, i.e. all of its jobs, has finished.
     **/
  }
}
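If the cached data is no longer needed before the application ends, it can be released early rather than waiting for the automatic cleanup. A minimal sketch, reusing rdd03 above (unpersist is a standard RDD method):

// Drop rdd03's persisted blocks once downstream jobs no longer need them,
// freeing memory/disk for other RDDs in the same application.
rdd03.unpersist()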
5.2 checkpoint
package sparkCore.rddPersist

import org.apache.spark.sql.SparkSession

object RDDPersistV2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val rdd01 = sc.makeRDD(List("hello java", "hello scala", "hello spark"), 2)
    val rdd02 = rdd01.flatMap(x => {
      println("flatMap..."); x.split(" ")
    })
    val rdd03 = rdd02.map(x => {
      println("map..."); (x, 1)
    })
    // rdd03.cache() // persist(StorageLevel.MEMORY_ONLY)
    // rdd03.persist(StorageLevel.DISK_ONLY)
    // Set the checkpoint directory. Checkpoint data is NOT deleted when the
    // application (all of its jobs) finishes, so it can be reused across applications.
    // However, checkpoint() re-runs the lineage in an extra job, so "flatMap..."
    // and "map..." would still be printed.
    // To avoid that recomputation, checkpoint is usually combined with cache.
    sc.setCheckpointDir("checkPoint")
    rdd03.cache()
    rdd03.checkpoint()
    val rdd04 = rdd03.reduceByKey(_ + _)
    rdd04.collect.foreach(println(_))
    println("*****************************")
    val rdd05 = rdd03.groupByKey()
    rdd05.collect.foreach(println(_))
    sc.stop()
  }
}
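In local mode a relative path such as "checkPoint" works, but on a cluster the checkpoint directory must be on a fault-tolerant shared filesystem such as HDFS, so that every executor can write to it and the data survives node failures. A minimal sketch; the HDFS URL is a placeholder for your own cluster:

// Point the checkpoint directory at reliable shared storage (placeholder URL);
// a local path would only be visible on a single machine.
sc.setCheckpointDir("hdfs://namenode:8020/user/spark/checkpoints")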
5.3 Differences between cache, persist, and checkpoint
package sparkCore.rddPersist

import org.apache.spark.sql.SparkSession

object RDDPersistV3 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val rdd01 = sc.makeRDD(List("hello java", "hello scala", "hello spark"), 2)
    val rdd02 = rdd01.flatMap(_.split(" "))
    val rdd03 = rdd02.map((_, 1))
    // rdd03.cache()
    sc.setCheckpointDir("checkPoint")
    rdd03.checkpoint()
    println(rdd03.toDebugString) // lineage before any action has been executed
    val rdd04 = rdd03.reduceByKey(_ + _)
    rdd04.collect
    println("*********************************")
    println(rdd03.toDebugString)
    sc.stop()

    /**
     * Summary:
     * cache/persist add a new dependency to the lineage; if the cached data is lost,
     * it can be recomputed from the original lineage. With rdd03.cache() enabled:
     * (2) MapPartitionsRDD[2] at map at RDDPersistV3.scala:17 [Memory Deserialized 1x Replicated]
     *  |  MapPartitionsRDD[1] at flatMap at RDDPersistV3.scala:16 [Memory Deserialized 1x Replicated]
     *  |  ParallelCollectionRDD[0] at makeRDD at RDDPersistV3.scala:14 [Memory Deserialized 1x Replicated]
     * *********************************
     * (2) MapPartitionsRDD[2] at map at RDDPersistV3.scala:17 [Memory Deserialized 1x Replicated]
     *  |       CachedPartitions: 2; MemorySize: 560.0 B; ExternalBlockStoreSize: 0.0 B; DiskSize: 0.0 B
     *  |  MapPartitionsRDD[1] at flatMap at RDDPersistV3.scala:16 [Memory Deserialized 1x Replicated]
     *  |  ParallelCollectionRDD[0] at makeRDD at RDDPersistV3.scala:14 [Memory Deserialized 1x Replicated]
     *
     * checkpoint severs the old lineage and establishes a new one, effectively
     * changing the data source. With only checkpoint() enabled:
     * (2) MapPartitionsRDD[2] at map at RDDPersistV3.scala:17 []
     *  |  MapPartitionsRDD[1] at flatMap at RDDPersistV3.scala:16 []
     *  |  ParallelCollectionRDD[0] at makeRDD at RDDPersistV3.scala:14 []
     * *********************************
     * (2) MapPartitionsRDD[2] at map at RDDPersistV3.scala:17 []
     *  |  ReliableCheckpointRDD[4] at collect at RDDPersistV3.scala:26 []
     **/
  }
}
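A closely related API: when the goal is only to truncate a long lineage (for example in iterative computations) and durability across applications is not required, RDD.localCheckpoint() stores the data on the executors instead of a reliable filesystem. A minimal sketch; note that this trades fault tolerance for speed:

// Truncate rdd03's lineage using executor-local block storage; use this
// INSTEAD of checkpoint(), not in addition to it. No checkpoint directory
// is needed, but if an executor is lost the data cannot be recomputed.
rdd03.localCheckpoint()
rdd03.count() // materialize the local checkpoint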