reduce
def reduce(f: (T, T) => T): T
- 作用:用f函数聚合RDD的所有元素:先聚合分区内数据,再聚合分区间数据;
- 例子
object SP_reduce {
  // Example of the `reduce` action: combines all RDD elements with the
  // given function, first within each partition, then across partitions.
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Read whitespace-separated integers from the input file.
    val nums: RDD[Int] = sc.textFile("data/1.txt").flatMap(_.split(" ")).map(_.toInt)
    val total: Int = nums.reduce(_ + _)
    println(total)
    sc.stop()
  }
}
collect
def collect(): Array[T]
- 作用:在驱动程序中,以数组的形式返回数据集的所有元素
- 例子
object SP_collect {
  // Example of the `collect` action: pulls every element of the RDD back
  // to the driver as an Array, then prints each one.
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Read whitespace-separated integers from the input file.
    val nums: RDD[Int] = sc.textFile("data/1.txt").flatMap(_.split(" ")).map(_.toInt)
    nums.collect().foreach(println)
    sc.stop()
  }
}
count
def count(): Long = sc.runJob(this, Utils.getIteratorSize _).sum
object SP_count {
  // Example of the `count` action: returns the number of elements
  // in the RDD as a Long.
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Read whitespace-separated integers from the input file.
    val nums: RDD[Int] = sc.textFile("data/1.txt").flatMap(_.split(" ")).map(_.toInt)
    println(nums.count())
    sc.stop()
  }
}
first
def first(): T = withScope {
take(1) match {
case Array(t) => t
case _ => throw SparkCoreErrors.emptyCollectionError()
}
}
object SP_first {
  // Example of the `first` action: returns the first element of the RDD
  // (implemented on top of take(1); throws on an empty RDD).
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Read whitespace-separated integers from the input file.
    val nums: RDD[Int] = sc.textFile("data/1.txt").flatMap(_.split(" ")).map(_.toInt)
    println(nums.first())
    sc.stop()
  }
}
take
def take(num: Int): Array[T]
- 作用:返回RDD的前几个元素,first也是用take实现的
- 例子
object SP_take {
  // Example of the `take` action: returns the first N elements of the RDD
  // as an Array on the driver.
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Read whitespace-separated integers from the input file.
    val nums: RDD[Int] = sc.textFile("data/1.txt").flatMap(_.split(" ")).map(_.toInt)
    val firstFive: Array[Int] = nums.take(5)
    firstFive.foreach(println)
    sc.stop()
  }
}
takeOrdered
def takeOrdered(num: Int)(implicit ord: Ordering[T]): Array[T]
- 作用:返回RDD按给定排序规则排序后的前N个元素,可以传入自定义的Ordering来指定排序方式
- 例子
object SP_takeOrdered {
  // Example of the `takeOrdered` action: returns the first N elements
  // according to the supplied Ordering (here the custom AgeOrdering).
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Read whitespace-separated integers from the input file.
    val nums: RDD[Int] = sc.textFile("data/1.txt").flatMap(_.split(" ")).map(_.toInt)
    val topThree: Array[Int] = nums.takeOrdered(3)(AgeOrdering)
    topThree.foreach(println)
    sc.stop()
  }
}
/**
 * Descending Ordering for Int (larger values sort first), used by
 * SP_takeOrdered. Serializable so Spark can ship it to executors.
 *
 * Fix: the original `y - x` subtraction can overflow Int (e.g. for
 * x = Int.MinValue) and return the wrong sign; Integer.compare is
 * overflow-safe and only the sign of the result matters to callers.
 */
object AgeOrdering extends Ordering[Int] with Serializable {
  override def compare(x: Int, y: Int): Int = Integer.compare(y, x)
}
aggregate
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
- 作用:每个分区里面的元素通过分区的逻辑和初始值进行聚合,然后用分区间逻辑和初始值进行操作;也就是分区内和分区间都用初始值计算一遍
- 例子
object SP_aggregate {
  // Example of the `aggregate` action: folds each partition with seqOp
  // starting from the zero value, then merges partition results with
  // combOp — the zero value participates in both phases.
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Four elements spread over eight partitions.
    val nums: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4), 8)
    val result: Int = nums.aggregate(10)(_ + _, _ + _)
    println(result)
    sc.stop()
  }
}
fold
def fold(zeroValue: T)(op: (T, T) => T): T
- 作用:每个分区里面的元素通过分区的逻辑和初始值进行聚合,然后用分区间逻辑和初始值进行操作;也就是分区内和分区间都用初始值计算一遍,分区内和分区间的函数是同一个.
- 例子
object SP_fold {
  // Example of the `fold` action: like `aggregate`, but the same function
  // is used both within and across partitions; the zero value is applied
  // in both phases.
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    // Four elements spread over eight partitions.
    val nums: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4), 8)
    val result: Int = nums.fold(10)(_ + _)
    println(result)
    sc.stop()
  }
}
countByKey
def countByKey(): Map[K, Long] = self.withScope {
self.mapValues(_ => 1L).reduceByKey(_ + _).collect().toMap
}
- 作用:统计每个key的个数,实现是借助reduceByKey来实现的
- 例子
object SP_countByKey {
  // Example of the `countByKey` action: counts occurrences of each key
  // in a pair RDD (implemented via mapValues + reduceByKey + collect).
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WC")
    val sc: SparkContext = new SparkContext(conf)
    val pairs: RDD[(String, Int)] =
      sc.makeRDD(List(("a", 3), ("a", 2), ("c", 4), ("b", 3), ("c", 6), ("f", 8)), 2)
    val keyCounts = pairs.countByKey()
    keyCounts.foreach(println)
    sc.stop()
  }
}
save相关算子
- saveAsTextFile(path):保存文本文件
- saveAsSequenceFile(path) :保存为 Hadoop SequenceFile 格式的文件(要求RDD元素为键值对类型)
- saveAsObjectFile(path) :保存序列化成对象保存到文件