1. Getting the number of partitions of a DataFrame
scala> // Build the test data source: the numbers 1..100 as an RDD with 4 partitions, converted to a DataFrame
scala> val df = spark.sparkContext.makeRDD(1.to(100), 4).toDF("id")
df: org.apache.spark.sql.DataFrame = [id: int]
scala> // Get the partition count; the three calls below are equivalent ways to read it
scala> val partition_num=df.rdd.partitions.length
partition_num: Int = 4
scala> df.rdd.getNumPartitions
res58: Int = 4
scala> df.rdd.partitions.length
res59: Int = 4
scala> // Print the partition count
scala> println(partition_num)
4
2. Inspecting the contents of each partition of a DataFrame
scala> // Build the test data
scala> val df = spark.sparkContext.makeRDD(1.to(100), 4).toDF("id")
df: org.apache.spark.sql.DataFrame = [id: int]
scala> // glom() wraps each partition's elements into a single Array, returning a new RDD[Array[Row]]
scala> val res = df.rdd.glom()
res: org.apache.spark.rdd.RDD[Array[org.apache.spark.sql.Row]] = MapPartitionsRDD[65] at glom at <console>:25
scala> // Convert each partition's rows into an easy-to-read comma-separated string
scala> val res02 = res.map(x => x.map(_.getInt(0)).mkString(","))
res02: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[66] at map at <console>:25
scala> // Print the contents of each partition.
scala> // NOTE(review): foreach(println) runs on the executors, so the output below is only
scala> // visible because this shell runs in local mode; on a cluster, use collect() first.
scala> // Also note the print order is nondeterministic — partition 4 appears before partition 3 here.
scala> res02.foreach(println)
1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50
76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75
3. Inspecting the contents of individual RDD partitions (creating partitions, checking the partition count)
scala> // Build a small key/value dataset of (name, score) pairs
scala> val scores = Array(("Fred", 88), ("Fred", 95), ("Fred", 91), ("Wilma", 93), ("Wilma", 95), ("Wilma", 98))
scores: Array[(String, Int)] = Array((Fred,88), (Fred,95), (Fred,91), (Wilma,93), (Wilma,95), (Wilma,98))
scala> // parallelize with an explicit partition count of 3
scala> val input = sc.parallelize(scores,3)
input: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[67] at parallelize at <console>:26
scala> // Confirm the partition count
scala> input.partitions.size
res22: Int = 3
scala>
scala> // glom() groups each partition into an Array; collect() brings the result to the
scala> // driver so partition contents print in partition order (safe here — the data is tiny)
scala> input.glom().collect()
res23: Array[Array[(String, Int)]] = Array(Array((Fred,88), (Fred,95)), Array((Fred,91), (Wilma,93)), Array((Wilma,95), (Wilma,98)))