Spark RDD Operators (13): RDD Partitioning with HashPartitioner, RangePartitioner, and a Custom Partitioner
HashPartitioner
- HashPartitioner is Spark's default partitioner for key-value RDDs: it assigns each record to a partition based on the key's hashCode, as sketched just below.
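Conceptually, the assignment rule boils down to a non-negative modulo of the key's hashCode. The following is a simplified sketch of that idea, not the real class (which lives in org.apache.spark):

// Simplified sketch of HashPartitioner's assignment rule.
// The real implementation is org.apache.spark.HashPartitioner;
// this only illustrates the non-negative hashCode modulo.
def sketchGetPartition(key: Any, numPartitions: Int): Int =
  key match {
    case null => 0                                // null keys all land in partition 0
    case _ =>
      val mod = key.hashCode % numPartitions
      if (mod < 0) mod + numPartitions else mod   // keep the result non-negative
  }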
package nj.zb.sparkstu

import org.apache.spark.rdd.RDD
import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object HashPartitioner {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("HashPartitioner")
    val sc: SparkContext = new SparkContext(conf)

    val rdd1: RDD[(Int, Int)] = sc.parallelize(List((1, 1), (1, 2), (2, 3), (2, 4), (3, 5), (3, 6), (4, 7), (4, 8), (5, 9), (5, 10)))

    // Element count per partition before explicit partitioning
    rdd1.foreachPartition(partition => println(partition.length))
    println("-----------------------------")

    // Repartition by key hash into 3 partitions, then print the counts again
    val rdd2: RDD[(Int, Int)] = rdd1.partitionBy(new HashPartitioner(3))
    rdd2.foreachPartition(partition => println(partition.length))

    sc.stop()
  }
}
Sample output: the first block of counts depends on your machine's default parallelism (local[*] creates one partition per core). After partitionBy(new HashPartitioner(3)), keys are placed by hashCode % 3 (key 3 to partition 0; keys 1 and 4 to partition 1; keys 2 and 5 to partition 2), so the three partitions hold 2, 4, and 4 pairs, printed in nondeterministic order.
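To see not just the sizes but which keys land where, a small inspection step using the standard mapPartitionsWithIndex transformation can be appended to any of these examples (the string format here is just illustrative):

// Illustrative helper: prints each element tagged with its partition index.
// collect() is fine here because the demo data is tiny.
rdd2.mapPartitionsWithIndex { (idx, iter) =>
  iter.map(kv => s"partition $idx -> $kv")
}.collect().foreach(println)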
RangePartitioner
- Splits the key space into contiguous ranges and assigns each key to the partition that owns its range, so every partition holds a sorted, non-overlapping span of keys. It requires keys with a natural ordering, and it samples the RDD to choose boundaries that keep partition sizes roughly balanced.
package nj.zb.sparkstu

import org.apache.spark.rdd.RDD
import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}

object RangePartitioner {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("RangePartitioner")
    val sc: SparkContext = new SparkContext(conf)

    val rdd1: RDD[(Int, Int)] = sc.parallelize(List((1, 1), (5, 10), (5, 9), (2, 4), (3, 5), (3, 6), (4, 7), (4, 8), (2, 3), (1, 2)))

    // Element count per partition before explicit partitioning
    rdd1.foreachPartition(x => println(x.length))
    println("---------------------------")

    // RangePartitioner samples rdd1 to pick boundaries for 3 key ranges
    val rdd2: RDD[(Int, Int)] = rdd1.partitionBy(new RangePartitioner(3, rdd1))
    rdd2.foreachPartition(x => println(x.length))

    sc.stop()
  }
}
Sample output: after range partitioning, each partition holds a contiguous span of keys. With this data the boundaries typically fall near keys 2 and 4, giving partitions of roughly 4, 4, and 2 pairs; because boundaries come from sampling, the exact split can vary.
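Range partitioning is also what sortByKey relies on internally, which is why a sorted RDD ends up with globally ordered, non-overlapping partitions. A quick way to get a range-partitioned result without constructing the partitioner yourself (reusing rdd1 from the listing above):

// sortByKey range-partitions the data internally, so the output
// partitions hold globally ordered, non-overlapping key ranges.
val sorted: RDD[(Int, Int)] = rdd1.sortByKey(ascending = true, numPartitions = 3)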
Custom Partitioner
- Extend org.apache.spark.Partitioner and implement numPartitions and getPartition. The example below sends keys >= 4 to partition 0, keys 2 and 3 to partition 1, and all remaining keys to partition 2.
package nj.zb.sparkstu

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

// Custom partitioner: route each key to a partition by value range
class zidingyi(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts

  override def getPartition(key: Any): Int = {
    val k = key.toString.toInt
    if (k >= 4) 0       // keys 4 and above -> partition 0
    else if (k >= 2) 1  // keys 2 and 3     -> partition 1
    else 2              // everything else  -> partition 2
  }
}

object zidingyi {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("zidingyi")
    val sc: SparkContext = new SparkContext(conf)

    val rdd1: RDD[(Int, Int)] = sc.parallelize(List((1, 1), (5, 10), (5, 9), (2, 4), (3, 5), (3, 6), (4, 7), (4, 8), (2, 3), (1, 2)))

    val rdd2: RDD[(Int, Int)] = rdd1.partitionBy(new zidingyi(3))
    rdd2.foreachPartition(x => println(x.length))

    sc.stop()
  }
}
Sample output: this partitioner is deterministic, so the counts are always 4 (keys 4 and 5), 4 (keys 2 and 3), and 2 (key 1), printed in nondeterministic order across partitions.
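One practice worth adding to a real custom partitioner, not shown in the demo above, is overriding equals and hashCode so that Spark can recognize when two RDDs are partitioned the same way and skip an unnecessary shuffle on operations such as joins. A minimal sketch of the two overrides, to be placed inside the zidingyi class:

// Sketch: equals/hashCode let Spark compare partitioners and avoid
// re-shuffling data that is already partitioned identically.
override def equals(other: Any): Boolean = other match {
  case p: zidingyi => p.numPartitions == numPartitions
  case _           => false
}

override def hashCode: Int = numPartitions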