The Partitioner is a core component of a Spark RDD: it records how the data is split and defines the strategy for reassigning keys to partitions during a shuffle, i.e., it decides which partition each key lands in. When an RDD holds key-value pairs, you can extend Partitioner to implement custom partitioning.
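For reference, Spark's built-in HashPartitioner assigns a key to a partition using a non-negative modulo of its hashCode. The following is a minimal sketch of that idea (illustrative only, not Spark's actual source):

// A simplified sketch of hash-based partitioning, for illustration;
// it mirrors the idea behind Spark's HashPartitioner but is not its source.
import org.apache.spark.Partitioner

class SimpleHashPartitioner(partitions: Int) extends Partitioner {
  override def numPartitions: Int = partitions

  override def getPartition(key: Any): Int = key match {
    case null => 0
    // hashCode can be negative, so force the modulo result to be non-negative.
    case _    => ((key.hashCode % partitions) + partitions) % partitions
  }
}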
import scala.collection.mutable

import org.apache.spark.Partitioner

/**
 * Custom partitioning logic: each word in the words collection is assigned
 * to its own dedicated partition.
 *
 * @param words the words to map to partitions, one partition per word
 */
class MyPartitioner(words: Array[String]) extends Partitioner {

  // Build a map from each word to its partition number.
  private val wordMap = new mutable.HashMap[String, Int]()
  private var count = 0
  for (word <- words) {
    wordMap += (word -> count)
    count += 1
  }

  // The number of partitions equals the number of words.
  override def numPartitions: Int = words.length

  // Look up the partition number for a key; fall back to partition 0 for
  // keys not in the words array, avoiding a runtime NoSuchElementException.
  override def getPartition(key: Any): Int = {
    wordMap.getOrElse(key.toString, 0)
  }
}
// Apply the custom partitioner to repartition a key-value RDD.
rdd.partitionBy(new MyPartitioner(words))
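For context, here is a minimal end-to-end sketch showing how MyPartitioner might be used; the app name, word list, and sample data are illustrative assumptions, not from the original:

import org.apache.spark.{SparkConf, SparkContext}

object MyPartitionerDemo {
  def main(args: Array[String]): Unit = {
    // Local SparkContext for demonstration purposes (assumed setup).
    val conf = new SparkConf().setAppName("MyPartitionerDemo").setMaster("local[*]")
    val sc   = new SparkContext(conf)

    val words = Array("spark", "hadoop", "flink")

    // A key-value RDD whose keys are drawn from the words array.
    val rdd = sc.parallelize(Seq(
      ("spark", 1), ("hadoop", 1), ("spark", 1), ("flink", 1)
    ))

    // Repartition so that each word's records land in their own partition.
    val partitioned = rdd.partitionBy(new MyPartitioner(words))

    // Inspect each partition's contents to verify the placement.
    partitioned.glom().collect().zipWithIndex.foreach { case (part, i) =>
      println(s"partition $i: ${part.mkString(", ")}")
    }

    sc.stop()
  }
}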