cache,persist,checkpoint三者的区别
- cache:缓存在内存中,效率高但不安全,存在数据丢失和内存溢出的风险,而且会在血缘关系中添加依赖
- persist:可选择存储级别(内存、磁盘等),落盘时较为安全。但数据在应用结束后会被删除。会在血缘关系中添加依赖
- checkpoint:将数据永久保存在路径中(HDFS),可永久保存但是不参与原本的计算会独立计算一次。为了提升效率,一般和cache联合使用。执行过程中会切断血缘关系。重新建立新的血缘关系,等同于改变数据源
// Checkpoint demo: the checkpoint directory must be set BEFORE calling
// rdd.checkpoint(). On a cluster this should be a fault-tolerant path (e.g. HDFS).
val sc: SparkContext = new SparkContext(conf)
sc.setCheckpointDir("checkpoint")
val rdd: RDD[String] = sc.textFile("datas")
// Cache first so the separate checkpoint job reuses the cached data
// instead of recomputing the whole lineage from the source.
rdd.cache()
// Truncates the lineage: downstream stages read from the checkpoint files,
// as if the checkpoint path were a new data source.
rdd.checkpoint()
分区器(自定义分区)
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}
object my_partition {
  // Demo of a custom partitioner: every distinct key gets its own partition.
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("my_partition")
    val context = new SparkContext(sparkConf)

    val pairs: List[(String, String)] = List(
      ("python", "xxxxxxxxxx"),
      ("java", "xxxxxxxxxx"),
      ("scala", "xxxxxxxxxx"),
      ("spark", "xxxxxxxxxx"),
      ("scala", "xxxxxxxxxx")
    )
    val sourceRDD: RDD[(String, String)] = context.makeRDD(pairs, 3)
    println("makeRDD")

    // The RDD is traversed twice (key collection + repartition), so cache it.
    sourceRDD.cache()

    // Collect the distinct keys on the driver; one target partition per key.
    val distinctKeys: List[String] = sourceRDD.map(_._1).distinct().collect().toList
    val keyCount: Int = distinctKeys.size

    val partitioned: RDD[(String, String)] = sourceRDD.partitionBy(new myPartitioner(keyCount, distinctKeys))
    partitioned.saveAsTextFile("output")

    context.stop()
  }
}
/**
 * Partitioner that sends each key to the partition equal to its position in `list`.
 *
 * @param num  total number of partitions (should equal list.size)
 * @param list ordered list of expected keys; index == target partition
 */
class myPartitioner(num: Int, list: List[String]) extends Partitioner {
  override def numPartitions: Int = num

  override def getPartition(key: Any): Int = {
    val index: Int = list.indexOf(key)
    // Bug fix: indexOf returns -1 for a key not in `list`, but Spark requires
    // getPartition to return a value in [0, numPartitions). Route unknown keys
    // to partition 0 instead of crashing the shuffle.
    if (index >= 0) index else 0
  }
}
累加器(分布式共享只写变量)
// Built-in long accumulator demo: a distributed, write-only shared variable.
// Executors only add to it; the merged result is read on the driver.
val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4))
val acc = sc.longAccumulator("acc")
// foreach is an action, so the adds actually execute (transformations alone would not).
rdd.foreach(
  num => {
    acc.add(num)
  }
)
// Read the merged value on the driver only; expected: 1+2+3+4 = 10.
println(acc.value)
自定义累加器进行wordcount
package com.yan
import org.apache.spark.rdd.RDD
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object wordCountAcc {
  // Word count implemented with a custom accumulator instead of reduceByKey,
  // avoiding a shuffle: each executor accumulates locally and Spark merges on the driver.
  def main(args: Array[String]): Unit = {
    // Bug fix: appName was "my_partition", copy-pasted from the partitioner demo.
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("wordCountAcc")
    val sc = new SparkContext(conf)

    val rdd: RDD[String] = sc.makeRDD(List("hello", "spark", "scala", "spark"))

    // Custom accumulators must be registered with the SparkContext before use.
    val wcAcc = new myAcc()
    sc.register(wcAcc, "wc")

    rdd.foreach(
      word =>
        wcAcc.add(word)
    )

    // Read the merged word counts on the driver.
    println(wcAcc.value)
    sc.stop()
  }
}
/**
 * Custom AccumulatorV2 performing a distributed word count.
 * IN = a single word (String); OUT = a word -> count map.
 */
class myAcc extends AccumulatorV2[String, mutable.Map[String, Long]] {
  private var wcMap = mutable.Map[String, Long]()

  // Zero state: nothing counted yet.
  override def isZero: Boolean = wcMap.isEmpty

  // Bug fix: AccumulatorV2.copy() must return a copy holding the CURRENT value.
  // The original returned a fresh empty accumulator, silently dropping any
  // counts already accumulated at the time of the copy.
  override def copy(): AccumulatorV2[String, mutable.Map[String, Long]] = {
    val acc = new myAcc()
    acc.wcMap = wcMap.clone()
    acc
  }

  override def reset(): Unit = wcMap.clear()

  // Executor side: increment the count for one word.
  override def add(v: String): Unit = {
    wcMap.update(v, wcMap.getOrElse(v, 0L) + 1)
  }

  // Driver side: fold another task's partial counts into this accumulator.
  override def merge(other: AccumulatorV2[String, mutable.Map[String, Long]]): Unit = {
    other.value.foreach { case (word, count) =>
      wcMap.update(word, wcMap.getOrElse(word, 0L) + count)
    }
  }

  override def value: mutable.Map[String, Long] = wcMap
}