sc.parallelize 数据分区划分
1. parallelize 方法
分区相关核心代码
/**
 * Wraps a local collection in a [[ParallelCollectionRDD]].
 *
 * The call runs inside `withScope` and first invokes `assertNotStopped()`.
 * The resulting RDD is built from `seq`, `numSlices` and an empty
 * `Map[Int, Seq[String]]` (presumably per-partition location preferences;
 * confirm against `ParallelCollectionRDD`).
 *
 * @param seq       the input data
 * @param numSlices number of partitions; `defaultParallelism` is used when omitted
 * @return an RDD backed by `seq`
 */
def parallelize[T: ClassTag](
    seq: Seq[T], // input data
    numSlices: Int = defaultParallelism // partition count; default applies when not passed
): RDD[T] = withScope {
  assertNotStopped()
  new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
}
获取分区
/**
 * Builds one [[ParallelCollectionPartition]] per slice produced by
 * `ParallelCollectionRDD.slice(data, numSlices)`. The partition at index `i`
 * is constructed from `id`, `i` and the i-th slice.
 */
override def getPartitions: Array[Partition] = {
  val chunks = ParallelCollectionRDD.slice(data, numSlices).toArray
  Array.tabulate[Partition](chunks.length) { idx =>
    new ParallelCollectionPartition(id, idx, chunks(idx))
  }
}
/**
 * Splits `seq` into `numSlices` contiguous sub-sequences.
 *
 * Slice `i` covers the index interval
 * `[i * length / numSlices, (i + 1) * length / numSlices)` using integer
 * division. For example, 5 elements in 3 slices yields sizes 1, 2, 2.
 * `Range` and `NumericRange` inputs are sliced into sub-ranges instead of
 * being copied into an array.
 *
 * @param seq       the collection to split
 * @param numSlices number of slices to produce; must be at least 1
 * @return `numSlices` sub-sequences, in order, that together contain every element of `seq`
 * @throws IllegalArgumentException if `numSlices` is less than 1
 */
def slice[T: ClassTag](seq: Seq[T], numSlices: Int): Seq[Seq[T]] = {
  if (numSlices < 1) {
    throw new IllegalArgumentException("Positive number of partitions required")
  }
  // Computes the (start, end) index pair of every slice; `end` is exclusive.
  // `length` is a Long, so `i * length` is evaluated in Long arithmetic.
  def positions(length: Long, numSlices: Int): Iterator[(Int, Int)] = {
    (0 until numSlices).iterator.map { i =>
      val start = ((i * length) / numSlices).toInt
      val end = (((i + 1) * length) / numSlices).toInt
      (start, end)
    }
  }
  seq match {
    case r: Range =>
      positions(r.length, numSlices).zipWithIndex.map { case ((start, end), index) =>
        // If the range is inclusive, use inclusive range for the last slice
        if (r.isInclusive && index == numSlices - 1) {
          new Range.Inclusive(r.start + start * r.step, r.end, r.step)
        } else {
          // Range.apply builds the same exclusive range as `new Range(...)`,
          // which cannot be instantiated directly from Scala 2.13 on.
          Range(r.start + start * r.step, r.start + end * r.step, r.step)
        }
      }.toSeq.asInstanceOf[Seq[Seq[T]]]
    case nr: NumericRange[_] =>
      // For ranges of Long, Double, BigInteger, etc
      val slices = new ArrayBuffer[Seq[T]](numSlices)
      var r = nr
      for ((start, end) <- positions(nr.length, numSlices)) {
        val sliceSize = end - start
        slices += r.take(sliceSize).asInstanceOf[Seq[T]]
        r = r.drop(sliceSize)
      }
      // Return a Seq view of the buffer rather than the mutable buffer itself.
      slices.toSeq
    case _ =>
      val array = seq.toArray // To prevent O(n^2) operations for List etc
      // Get the index bounds of each slice ...
      positions(array.length, numSlices).map { case (start, end) =>
        // ... and copy the matching elements out as that partition's data.
        array.slice(start, end).toSeq
      }.toSeq
  }
}
//***************返回分区数据**********************
/**
 * Copies the elements at indices `[from, until)` into a new collection.
 *
 * `from` is clamped to be at least 0 and `until` to the interval
 * `[0, length]`; an empty result is produced when the clamped bounds do not
 * describe a non-empty interval.
 */
def slice(from: Int, until: Int): Repr = {
  val begin = math.max(from, 0)
  val finish = math.min(math.max(until, 0), length)
  val builder = newBuilder
  // Tell the builder how many elements to expect (never negative).
  builder.sizeHint(math.max(finish - begin, 0))
  for (idx <- begin until finish) {
    builder += self(idx)
  }
  builder.result()
}
2. 例子解析
以sc.parallelize(Array(1,2,3,4,5),3)为例子
positions(5,3) 依次生成 (i*5/3, (i+1)*5/3),i = 0,1,2(整数除法,向下取整)
i=0: (0/3,5/3) => (0,1)
slice(0,1) => self(0) => (1)
i=1: (5/3,10/3) => (1,3)
slice(1,3) => self(1),self(2) => (2,3)
i=2: (10/3,15/3) => (3,5)
slice(3,5) => self(3),self(4) => (4,5)
验证结果
在 spark-shell 中执行 sc.parallelize(Array(1,2,3,4,5),3),并用 glom 查看每个分区中的数据:
scala> sc.parallelize(Array(1,2,3,4,5),3).glom.collect
res21: Array[Array[Int]] = Array(Array(1), Array(2, 3), Array(4, 5))