def coalesce ( numPartitions : Int , shuffle : Boolean = false ): RDD [T]
def repartition ( numPartitions : Int ): RDD [T]
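coalesce和repartition都用于将RDD的数据重新划分到给定数量(numPartitions)的分区中。repartition总是会对数据进行shuffle,等价于coalesce(numPartitions, shuffle = true);coalesce默认不进行shuffle(shuffle = false),此时只能用来减少分区数。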
Java代码如下:
package com.cb.spark.sparkrdd;
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
 * Minimal demonstration of {@code coalesce} and {@code repartition} on a
 * {@link JavaRDD}.
 *
 * <p>Builds a ten-partition RDD from the integers 1..10, then prints the
 * partition count of the original RDD, of the RDD coalesced to two partitions
 * (shuffle flag {@code false}), and of the RDD repartitioned to five partitions.
 */
public class CoalesceExample {

    /**
     * Runs the example against a local Spark master and prints three
     * partition counts to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("Coalesce").setMaster("local");

        // try-with-resources guarantees the context is shut down even when one of
        // the RDD operations below throws; close() stops the underlying context.
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<Integer> rdd =
                    jsc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 10);
            System.out.println(rdd.partitions().size()); // 10

            // Merge the data of the original ten partitions into two partitions,
            // with the shuffle flag explicitly set to false.
            JavaRDD<Integer> coalesceRDD = rdd.coalesce(2, false);
            System.out.println(coalesceRDD.partitions().size()); // 2

            // Redistribute the same source RDD into five partitions.
            JavaRDD<Integer> repartitionRDD = rdd.repartition(5);
            System.out.println(repartitionRDD.partitions().size()); // 5
        }
    }
}