基础转换操作

键值转换操作
partitionBy(partitioner: Partitioner): RDD[(K, V)]
将原来的RDD根据给定的Partitioner（分区器）进行重新分区，产生一个经过shuffle的新RDD。
scala> val rdd = sc.makeRDD(Array((1,"A"),(2,"B"),(3,"C"),(4,"D")), 2)
rdd: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at makeRDD at <console>:24
# 查看每个分区的元素
scala> rdd.mapPartitionsWithIndex{
| (partIdx, iter) => {
| var part_map = scala.collection.mutable.Map[String, List[(Int,String)]]()
| while(iter.hasNext){
| var part_name = "part_" + partIdx;
| var elem = iter.next()
| if(part_map.contains(part_name)) {
| var elems = part_map(part_name)
| elems ::= elem
| part_map(part_name) = elems
| }
| else{
| part_map(part_name) = List[(Int,String)]{elem}
| }
| }
| part_map.iterator
| }
| }.collect
res5: Array[(String, List[(Int, String)])] = Array((part_0,List((2,B), (1,A))), (part_1,List((4,D), (3,C))))
# 使用partitionBy重新分区
scala> var rddNew = rdd.partitionBy(new org.apache.spark.HashPartitioner(2))
rddNew: org.apache.spark.rdd.RDD[(Int, String)] = ShuffledRDD[2] at partitionBy at <console>:25
# 查看新的分区元素
scala> rddNew.mapPartitionsWithIndex{
| (partIdx, iter) => {
| var part_map = scala.collection.mutable.Map[String, List[(Int,String)]]()
| while(iter.hasNext){
| var part_name = "part_" + partIdx;
| var elem = iter.next()
| if(part_map.contains(part_name)) {
|