/**
 * An RDD that unions several parent RDDs which all share the same partitioner, while keeping
 * that partitioner on the result.
 *
 * The resulting RDD has as many partitions as the shared partitioner defines. Each output
 * partition is backed by one partition from every parent RDD (see
 * `PartitionerAwareUnionRDDPartition.parents`), and computing it concatenates the iterators of
 * those parent partitions in the order of `rdds`.
 *
 * @param sc   the SparkContext this RDD belongs to
 * @param rdds the parent RDDs; must be non-empty, every one must define a partitioner, and all
 *             partitioners must be equal. Set to null by `clearDependencies()`.
 */
private[spark]
class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  // Validate the parents eagerly so that misuse fails at construction time with a clear message.
  require(rdds.nonEmpty, "PartitionerAwareUnionRDD requires at least one parent RDD")
  require(rdds.forall(_.partitioner.isDefined),
    "All parent RDDs of PartitionerAwareUnionRDD must have a partitioner defined")
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    s"Parent RDDs have different partitioners: ${rdds.flatMap(_.partitioner)}")

  // All parents share one partitioner (checked above), so the first parent's is representative.
  override val partitioner = rdds.head.partitioner

  /**
   * Builds one `PartitionerAwareUnionRDDPartition` per partition index of the shared
   * partitioner. Each of them records which partitions of the parent `rdds` make up the output
   * partition with that index.
   */
  override def getPartitions: Array[Partition] = {
    // `partitioner` is guaranteed to be defined by the constructor checks.
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map { index =>
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }.toArray
  }

  /**
   * Get the location where most of the partitions of parent RDDs are located.
   *
   * @param s a partition of this RDD (must be a `PartitionerAwareUnionRDDPartition`)
   * @return a sequence holding the single host preferred by the largest number of parent
   *         partitions, or an empty sequence when no parent partition reports a location
   */
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug(s"Finding preferred location for $this, partition ${s.index}")
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    // Collect the current preferred hosts of every (parent RDD, parent partition) pair.
    val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) =>
      val parentLocations = currPrefLocs(rdd, part)
      logDebug(s"Location of $rdd partition ${part.index} = $parentLocations")
      parentLocations
    }
    val location =
      if (locations.isEmpty) {
        None
      } else {
        // Find the location that maximum number of parent partitions prefer
        Some(locations.groupBy(identity).maxBy(_._2.length)._1)
      }
    logDebug(s"Selected location for $this, partition ${s.index} = $location")
    location.toSeq
  }

  /**
   * Computes an output partition by chaining the iterators of its parent partitions.
   *
   * @param s       a partition of this RDD (must be a `PartitionerAwareUnionRDDPartition`)
   * @param context the task context passed through to each parent's `iterator`
   * @return an iterator over the elements of all parent partitions, in the order of `rdds`
   */
  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    // `parents` identifies which partitions of the parent `rdds` this partition is built from.
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    // Walk those parent partitions lazily and flatten them into a single partition.
    rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) =>
      rdd.iterator(p, context)
    }
  }

  /**
   * Clears the dependencies via the superclass and drops this RDD's own reference to the parent
   * RDDs (presumably so they can be garbage collected).
   */
  override def clearDependencies(): Unit = {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(_.host)
  }
}