import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._
import scala.collection.{mutable, Iterator}
object Ex5_Partitions {
// Utility for printing an RDD; it makes the partition layout easy to read
def analyze[T](r: RDD[T]) : Unit = {
// glom() turns each partition into an array, giving an RDD of arrays
val partitions = r.glom()
println(partitions.count() + " partitions")
// zipWithIndex was covered earlier; here it tags each partition's array with
// its index so the contents can be printed, space-separated
partitions.zipWithIndex().collect().foreach {
case (a, i) => {
println("Partition " + i + " contents:" +
a.foldLeft("")((e, s) => e + " " + s))
}
}
}
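// A small illustration of what analyze prints: for sc.parallelize(1 to 8, 2)
// (where the range is split evenly) the output would look like
//   2 partitions
//   Partition 0 contents: 1 2 3 4
//   Partition 1 contents: 5 6 7 8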
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("Ex5_Partitions").setMaster("local[4]")
val sc = new SparkContext(conf)
// Create the RDD: the integers 1 to 100 in 4 partitions
val numbers = sc.parallelize(1 to 100, 4)
println("original RDD:")
analyze(numbers)
// Keep only the values less than 34
val some = numbers.filter(_ < 34)
println("filtered RDD:")
analyze(some)
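// filter is a narrow transformation, so the 4 partitions are preserved;
// since parallelize splits 1 to 100 evenly (1-25 / 26-50 / 51-75 / 76-100),
// partitions 2 and 3 of `some` come out empty, as the output above shows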
// Similar in spirit to intersection: subtract returns the elements that
// appear in this RDD and not in the other RDD, without deduplicating
val diff = numbers.subtract(some)
println("the complement:")
analyze(diff)
println("it is a " + diff.getClass.getCanonicalName)
// The same operation, but with the partition count passed explicitly
// (4 again); the data is shuffled into the new partitions
val diffSamePart = numbers.subtract(some, 4)
println("the complement (explicit but same number of partitions):")
analyze(diffSamePart)
// Changing the number of partitions (to 6) redistributes the data
val diffMorePart = numbers.subtract(some, 6)
println("the complement (different number of partitions):")
analyze(diffMorePart)
println("it is a " + diffMorePart.getClass.getCanonicalName)
// An alternative formulation that filters within each pair of co-located
// partitions, avoiding the shuffle (and its communication cost) that
// subtract incurs
def subtractFunc(wholeIter: Iterator[Int], partIter: Iterator[Int]) :
Iterator[Int] = {
// load this partition of the subset into a set, then drop its members
// from the matching partition of the full RDD
val partSet = new mutable.HashSet[Int]()
partSet ++= partIter
wholeIter.filterNot(partSet.contains(_))
}
// zipPartitions requires the two RDDs to have the same number of partitions,
// though not the same number of elements in each partition; that holds here
// because filter is narrow, so `some` kept numbers' original 4 partitions
val diffOriginalPart = numbers.zipPartitions(some)(subtractFunc)
println("complement with original partitioning")
analyze(diffOriginalPart)
println("it is a " + diffOriginalPart.getClass.getCanonicalName)
/**
 * Now for coalesce and repartition. repartition is implemented on top of
 * coalesce, and two dependency concepts matter here:
 * narrow dependency: each partition of the parent RDD feeds at most one
 * partition of the child, so no shuffle is needed;
 * wide dependency: a partition of the child depends on several parent
 * partitions, so a shuffle is required.
 * some.coalesce(3, true) is equivalent to some.repartition(3), because
 * repartition(n) is simply coalesce(n, shuffle = true) under the hood.
 */
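// A minimal sketch (nothing beyond the plain RDD API is assumed):
// toDebugString exposes the shuffle that repartition, i.e.
// coalesce(n, shuffle = true), introduces, while a plain coalesce(n)
// lineage stays shuffle-free
println(numbers.repartition(3).toDebugString)
println(numbers.coalesce(3).toDebugString)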
val threePart = numbers.repartition(3)
println("numbers in three partitions")
analyze(threePart)
println("it is a " + threePart.getClass.getCanonicalName)
val threePartShuffle = some.coalesce(3, true)
println("subset in three partitions after a shuffle")
analyze(threePartShuffle)
println("it is a " + threePartShuffle.getClass.getCanonicalName)
// with shuffle = false no data is redistributed: existing partitions are
// simply merged
val twoPartNoShuffle = some.coalesce(2, false)
println("subset in two partitions without a shuffle")
analyze(twoPartNoShuffle)
println("it is a " + twoPartNoShuffle.getClass.getCanonicalName)
// groupBy requires a shuffle
val groupedNumbers = numbers.groupBy(n => if (n % 2 == 0) "even" else "odd")
println("numbers grouped into 'odd' and 'even'")
analyze(groupedNumbers)
println("it is a " + groupedNumbers.getClass.getCanonicalName)
// preferredLocations: where each partition's data would prefer to be computed
numbers.partitions.foreach(p => {
println("Partition: " + p.index)
numbers.preferredLocations(p).foreach(s => println(" Location: " + s))
})
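// Note: an RDD built with parallelize in local mode carries no block-location
// metadata, so these preferred-location lists are typically empty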
// mapPartitions to achieve in-place grouping
// TODO: fix this example to make it a bit more natural
/**
 * This relies on Scala's yield keyword: a for/yield comprehension records the
 * result of every iteration and returns them all as a single collection.
 */
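// A small-scale sketch of the comprehension below:
//   for (x <- 1 to 3; y <- 1 to x) yield ("S" + x, y)
//   // -> Vector((S1,1), (S2,1), (S2,2), (S3,1), (S3,2), (S3,3))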
val pairs = sc.parallelize(for (x <- 1 to 6; y <- 1 to x) yield ("S" + x, y), 4)
analyze(pairs)
/**
 * foldByKey folds the values of an RDD[(K, V)] per key: within each partition
 * zeroValue is used as the initial value for a key, the fold function is then
 * applied across that key's values, and the per-partition results are merged.
 *
 * val rdd1 = sc.makeRDD(Array(("A",0),("A",2),("B",1),("B",2),("C",1)))
 * rdd1.foldByKey(0)(_+_).collect
 * // Array[(String, Int)] = Array((A,2), (B,3), (C,1))
 *
 * The call below groups by key and sums each key's values, leaving the
 * result in 4 partitions.
 */
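// A hedged aside (a sketch, not from the original): zeroValue is folded in
// once per key per partition, so a non-zero zeroValue can contribute several
// times when a key's values span partitions, e.g.
//   sc.makeRDD(Seq(("A", 1), ("A", 2)), 2).foldByKey(10)(_ + _).collect()
//   // -> Array((A,23)) when the two values land in different partitions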
val rollup = pairs.foldByKey(0, 4)(_ + _)
println("just rolling it up")
analyze(rollup)
/**
 * The difference between map and mapPartitions, straight from the Spark source.
 *
 * map:
 * def map[U: ClassTag](f: T => U): RDD[U] = withScope {
 *   val cleanF = sc.clean(f)
 *   new MapPartitionsRDD[U, T](this, (context, pid, iter) => iter.map(cleanF))
 * }
 *
 * mapPartitions:
 * def mapPartitions[U: ClassTag](
 *     f: Iterator[T] => Iterator[U],
 *     preservesPartitioning: Boolean = false): RDD[U] = withScope {
 *   val cleanedF = sc.clean(f)
 *   new MapPartitionsRDD(
 *     this,
 *     (context: TaskContext, index: Int, iter: Iterator[T]) => cleanedF(iter),
 *     preservesPartitioning)
 * }
 *
 * The source makes the contrast plain: map applies cleanF to every element of
 * each partition's iterator one at a time (it is cleanF rather than f because
 * Spark first runs closure cleaning on the function), whereas mapPartitions
 * hands the entire partition iterator to cleanedF in a single call.
 *
 * In the example below, rollupFunc sums the values that share a key, within
 * each partition separately.
 */
def rollupFunc(i: Iterator[(String, Int)]) : Iterator[(String, Int)] = {
val m = new mutable.HashMap[String, Int]()
// accumulate a running sum per key, for this partition only
i.foreach {
case (k, v) => if (m.contains(k)) m(k) = m(k) + v else m(k) = v
}
m.iterator
}
// preservesPartitioning = true is safe here because the keys are unchanged
val inPlaceRollup = pairs.mapPartitions(rollupFunc, true)
println("rolling it up really carefully")
analyze(inPlaceRollup)
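// A hedged sketch (not part of the original example) of the other classic
// mapPartitions motivation: per-partition setup runs once per partition
// rather than once per element; the "costly" value here is a hypothetical
// stand-in for something like opening a connection
val withSetup = numbers.mapPartitions { iter =>
val costly = math.Pi // pretend this is expensive to construct
iter.map(n => n * costly)
}
println("per-partition setup sketch: " + withSetup.take(3).mkString(" "))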
}
}