/**
* Persist this RDD with the default storage level (`MEMORY_ONLY`).
*/
def persist(): this.type = persist(StorageLevel.MEMORY_ONLY)
persist(newLevel: StorageLevel) source:
/**
* Set this RDD's storage level to persist its values across operations after the first time
* it is computed. This can only be used to assign a new storage level if the RDD does not
* have a storage level set yet. Local checkpointing is an exception.
*/
def persist(newLevel: StorageLevel): this.type = {
  if (isLocallyCheckpointed) {
    // This means the user previously called localCheckpoint(), which should have already
    // marked this RDD for persisting. Here we should override the old storage level with
    // one that is explicitly requested by the user (after adapting it to use disk).
    persist(LocalRDDCheckpointData.transformStorageLevel(newLevel), allowOverride = true)
  } else {
    persist(newLevel, allowOverride = false)
  }
}
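As the comment notes, once an RDD has been assigned a storage level it cannot be changed (local checkpointing aside). A minimal sketch of what that means in practice:
val rdd = sc.parallelize(1 to 10)
rdd.persist(StorageLevel.MEMORY_ONLY)
rdd.persist(StorageLevel.MEMORY_ONLY) // calling persist again with the same level is allowed
// rdd.persist(StorageLevel.DISK_ONLY) // would throw UnsupportedOperationException: the level cannot be changed once assigned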
StorageLevel source:
/**
* Various [[org.apache.spark.storage.StorageLevel]] defined and utility functions for creating
* new storage levels.
*/
object StorageLevel {
  val NONE = new StorageLevel(false, false, false, false)
  val DISK_ONLY = new StorageLevel(true, false, false, false)
  val DISK_ONLY_2 = new StorageLevel(true, false, false, false, 2)
  val MEMORY_ONLY = new StorageLevel(false, true, false, true)
  val MEMORY_ONLY_2 = new StorageLevel(false, true, false, true, 2)
  val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false)
  val MEMORY_ONLY_SER_2 = new StorageLevel(false, true, false, false, 2)
  val MEMORY_AND_DISK = new StorageLevel(true, true, false, true)
  val MEMORY_AND_DISK_2 = new StorageLevel(true, true, false, true, 2)
  val MEMORY_AND_DISK_SER = new StorageLevel(true, true, false, false)
  val MEMORY_AND_DISK_SER_2 = new StorageLevel(true, true, false, false, 2)
  val OFF_HEAP = new StorageLevel(true, true, true, false, 1)
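The constructor flags are, in order: useDisk, useMemory, useOffHeap, deserialized, and an optional replication count (default 1). So MEMORY_AND_DISK_SER_2, for example, spills to disk, keeps data in memory in serialized form, stays on-heap, and keeps 2 replicas. A custom level can be built the same way (the flag combination below is just an illustration):
val diskSerialized2 = StorageLevel(true, false, false, false, 2) // useDisk, useMemory, useOffHeap, deserialized, replication; equivalent to DISK_ONLY_2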
e.g.:
val a = sc.parallelize(1 to 100)
a.cache()
a.persist()
a.persist(StorageLevel.MEMORY_ONLY)
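To check or drop the persistence later, a small sketch continuing the example above:
a.getStorageLevel // the level currently assigned (MEMORY_ONLY here)
a.unpersist()     // remove the RDD's cached blocks; the level is reset to NONE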
cache is essentially just persist with the default storage level:
/**
* Persist this RDD with the default storage level (`MEMORY_ONLY`).
*/
def cache(): this.type = persist()
/**
* Mark this RDD for checkpointing. It will be saved to a file inside the checkpoint
* directory set with `SparkContext#setCheckpointDir` and all references to its parent
* RDDs will be removed. This function must be called before any job has been
* executed on this RDD. It is strongly recommended that this RDD is persisted in
* memory, otherwise saving it on a file will require recomputation.
*/
def checkpoint(): Unit = RDDCheckpointData.synchronized {
  // NOTE: we use a global lock here due to complexities downstream with ensuring
  // children RDD partitions point to the correct parent partitions. In the future
  // we should revisit this consideration.
  if (context.checkpointDir.isEmpty) {
    throw new SparkException("Checkpoint directory has not been set in the SparkContext")
  } else if (checkpointData.isEmpty) {
    checkpointData = Some(new ReliableRDDCheckpointData(this))
  }
}
e.g.:
val a = sc.parallelize(1 to 100)
sc.setCheckpointDir("hdfs://192.168.72.2:8020/checkpoint/20190521")
a.persist(StorageLevel.MEMORY_ONLY)
a.checkpoint()
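Note that checkpoint(), like persist(), is lazy: the data is only written to the checkpoint directory at the end of the first job that computes this RDD, which is why persisting it first avoids computing it twice. A small sketch continuing the example:
a.count()           // the first action computes the RDD and writes the checkpoint
a.isCheckpointed    // true once the checkpoint has been materialized
a.getCheckpointFile // Some(path under the checkpoint directory set above)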
/**
* Broadcast a read-only variable to the cluster, returning a
* [[org.apache.spark.broadcast.Broadcast]] object for reading it in distributed functions.
* The variable will be sent to each cluster only once.
*
* @param value value to broadcast to the Spark nodes
* @return `Broadcast` object, a read-only variable cached on each machine
*/
def broadcast[T: ClassTag](value: T): Broadcast[T] = {
  assertNotStopped()
  require(!classOf[RDD[_]].isAssignableFrom(classTag[T].runtimeClass),
    "Can not directly broadcast RDDs; instead, call collect() and broadcast the result.")
  val bc = env.broadcastManager.newBroadcast[T](value, isLocal)
  val callSite = getCallSite
  logInfo("Created broadcast " + bc.id + " from " + callSite.shortForm)
  cleaner.foreach(_.registerBroadcastForCleanup(bc))
  bc
}
e.g.:
Suppose a region has 100,000 free WiFi hotspots, 1,000,000 users, and 1 billion connection records.
The 100,000 WiFi hotspots form table A: WIFI_ID, POS
The 1 billion connection records form table B: USER_ID, WIFI_ID, TIME, MESSAGE
The 1,000,000 users form table C: USER_ID, USER_NAME
We want to produce a result table D: USER_NAME, POS, TIME, MESSAGE
val A = sc.textFile("hdfs://192.168.72.2:8020/wifi").map(line => {
  val fields = line.split("\\|")
  val wifi = fields(0)
  val pos = fields(1)
  (wifi, pos)
})
val B = sc.textFile("hdfs://192.168.72.2:8020/connectionInfo").map(line => {
  val fields = line.split("\\|")
  val userID = fields(0)
  val wifiID = fields(1)
  val time = fields(2)
  val message = fields(3)
  (userID, wifiID, time, message)
})
val C = sc.textFile("hdfs://192.168.72.2:8020/user").map(line => {
  val fields = line.split("\\|")
  val userID = fields(0)
  val userName = fields(1)
  (userID, userName)
})
// Collect the two small tables into immutable arrays and broadcast them to the tasks
val wifiPosBroadcast = A.collect()
val wifiPos = sc.broadcast(wifiPosBroadcast)
val userBroadcast = C.collect()
val user = sc.broadcast(userBroadcast)
def mapPartitionFunc(iter: Iterator[(String, String, String, String)]): Iterator[(String, String, String, String)] = {
  // Read the broadcast values once per partition and index them for fast lookup
  val wifis = wifiPos.value.toMap   // WIFI_ID -> POS
  val users = user.value.toMap      // USER_ID -> USER_NAME
  // Map-side join: enrich each connection record with the user name and the WiFi position
  iter.map { case (userID, wifiID, time, message) =>
    (users.getOrElse(userID, ""), wifis.getOrElse(wifiID, ""), time, message)
  }
}
val D = B.mapPartitions(mapPartitionFunc)
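Nothing runs until an action is called on D; a sketch of finishing the job and releasing the broadcast data afterwards (the output path is just an example):
D.saveAsTextFile("hdfs://192.168.72.2:8020/result") // hypothetical output path
wifiPos.unpersist() // drop the cached copies on the executors
user.destroy()      // remove all data for this broadcast; it cannot be used afterwards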
/**
* Create and register a long accumulator, which starts with 0 and accumulates inputs by `add`.
*/
def longAccumulator(name: String): LongAccumulator = {
  val acc = new LongAccumulator
  register(acc, name)
  acc
}
override def add(v: jl.Long): Unit = {
  _sum += v
  _count += 1
}
override def value: jl.Long = _sum
e.g.:
val accum = sc.longAccumulator("My Accumulator")
sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum.add(x))
accum.value // 10
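LongAccumulator tracks more than the running sum, and accumulator updates are only guaranteed to be applied exactly once when they happen inside an action such as foreach (updates made in transformations may be re-applied if a task is retried). Continuing the example:
accum.count // 4: number of add() calls
accum.sum   // 10: same as accum.value
accum.avg   // 2.5: average of the added values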