scala> rdd2.flatMap(x=>x).collect
res3: Array[Char] = Array(h, e, l, l, o, w, o, r, l, d, h, o, w, a, r, e, y, o, u, ?, n, i, h, a, o, h, e, l, l, o, t, o, m)
scala> val rdd3 = rdd2.distinct
rdd3: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[7] at distinct at <console>:30
Check the default number of partitions of the RDD:
scala> rdd.partitions.size
res4: Int = 2
With 2 default partitions, shrinking the partition count works. Use coalesce to reduce the count and produce a new RDD:
scala> val rdd4 = rdd.coalesce(1)
rdd4: org.apache.spark.rdd.RDD[String] = CoalescedRDD[8] at coalesce at <console>:26
scala> rdd4.partitions.size
res10: Int = 1
With 2 default partitions, growing the partition count does NOT work without a shuffle:
scala> val rdd5 = rdd.coalesce(5)
rdd5: org.apache.spark.rdd.RDD[String] = CoalescedRDD[9] at coalesce at <console>:26
scala> rdd5.partitions.size
res12: Int = 2
To grow beyond the 2 default partitions, pass the shuffle argument as true:
scala> val rdd5 = rdd.coalesce(5,true)
rdd5: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[13] at coalesce at <console>:26
scala> rdd5.partitions.size
res13: Int = 5
Repartitioning with repartition can either increase or decrease the partition count (it always shuffles):
scala> val rdd6 = rdd5.repartition(8)
rdd6: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[11] at repartition at <console>:34
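A quick sanity check (a sketch; res numbering will differ in a fresh shell):
scala> rdd6.partitions.size
// Int = 8 -- repartition always shuffles, so growing the partition count works here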
3.randomSplit
def randomSplit(weights: Array[Double], seed: Long = Utils.random.nextLong): Array[RDD[T]]
Description: randomly distributes the RDD's elements according to the given weights and returns an array with one RDD per weight. Example use case: a Hadoop-style total-order sort.
scala> val rdd = sc.parallelize(List(1,2,3,4,5,6,7))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24
0.7 + 0.1 + 0.2 = 1: the 7 elements of rdd are distributed randomly according to these weights. (Weights that do not sum to 1 are normalized by Spark.)
scala> val rdd1 = rdd.randomSplit(Array(0.7,0.1,0.2))
rdd1: Array[org.apache.spark.rdd.RDD[Int]] = Array(MapPartitionsRDD[1] at randomSplit at <console>:26, MapPartitionsRDD[2] at randomSplit at <console>:26, MapPartitionsRDD[3] at randomSplit at <console>:26)
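A sketch of inspecting the splits (the names big/small/medium are introduced here for illustration; the per-split contents are random and vary between runs):
scala> val Array(big, small, medium) = rdd.randomSplit(Array(0.7, 0.1, 0.2))
scala> big.count + small.count + medium.count
// Long = 7 -- every element lands in exactly one of the splits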
4.glom
Description: returns the data items of each partition, gathered into one array per partition.
scala> val rdd = sc.parallelize(List(1,2,3,4,5,6,7,8,9))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[0] at parallelize at <console>:24
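glom itself is not called above; a minimal sketch (re-creating the RDD with an explicit 3 partitions so the layout is predictable):
scala> val rdd = sc.parallelize(List(1,2,3,4,5,6,7,8,9), 3)
scala> rdd.glom().collect()
// Array[Array[Int]] = Array(Array(1, 2, 3), Array(4, 5, 6), Array(7, 8, 9))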
5.union: set union -- merges two RDDs without removing duplicates.
scala> val rdd = sc.parallelize(Array(9,8,7,6,5,4,3,2))
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[4] at parallelize at <console>:24
scala> val rdd1 = sc.parallelize(List(1,2,3,4,5,6,7,8,9))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[3] at parallelize at <console>:24
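The union call itself is missing above; a minimal sketch using the two RDDs just defined:
scala> rdd.union(rdd1).collect
// Array(9, 8, 7, 6, 5, 4, 3, 2, 1, 2, 3, 4, 5, 6, 7, 8, 9) -- shared values appear twice, no deduplication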
import org.apache.spark.{SparkConf, SparkContext}

object MapPartitions {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("MapPartitions")
    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    // 3 partitions: [1,2,3], [4,5,6], [7,8,9]
    val s = sc.parallelize(1 to 9, 3)
    // Apply myfunc to each partition's iterator and print the resulting pairs.
    s.mapPartitions(myfunc).collect().foreach(println)
    sc.stop()
  }

  // Emits each pair of adjacent elements within a single partition.
  def myfunc[T](iter: Iterator[T]): Iterator[(T, T)] = {
    var list = List[(T, T)]()
    var res1 = iter.next()
    while (iter.hasNext) {
      val res2 = iter.next()
      list = (res1, res2) :: list
      res1 = res2
    }
    list.iterator
  }
}
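With the three partitions [1,2,3], [4,5,6] and [7,8,9], myfunc pairs adjacent elements within each partition only: the output contains (1,2), (2,3), (4,5), (5,6), (7,8) and (8,9), but never the cross-partition pairs (3,4) or (6,7).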
10.zip: builds a new RDD of pairs
def zip[U: ClassTag](other: RDD[U]): RDD[(T, U)]
Description:
1. The two RDDs may hold elements of different types.
2. Both RDDs must have the same number of partitions.
3. Each pair of corresponding partitions must hold the same number of elements; if the counts differ, the job fails with:
TaskSetManager: Lost task 0.0 in stage 11.0 (TID 29, 192.168.179.131, executor 1): org.apache.spark.SparkException: Can only zip RDDs with same number of elements in each partition
scala> val rdd = sc.parallelize(1 to 10,3)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[34] at parallelize at <console>:24
val rdd1 = sc.parallelize(List("a","b","c","d","e","f","g","h","i","j"),3) rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[38] at parallelize at <console>:24
scala> val rdd1 = sc.parallelize(List("a","b","c","d","e","f","g","h","i","j"),3)
rdd1: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[38] at parallelize at <console>:24
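The zip call itself (both RDDs hold 10 elements in 3 partitions, so the per-partition counts line up):
scala> rdd.zip(rdd1).collect
// Array((1,a), (2,b), (3,c), (4,d), (5,e), (6,f), (7,g), (8,h), (9,i), (10,j))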
scala> val rdd = sc.parallelize(List((1,"a"),(1,"f"),(2,"b"),(2,"c"),(3,"d")))
rdd: org.apache.spark.rdd.RDD[(Int, String)] = ParallelCollectionRDD[0] at parallelize at <console>:24
scala> val rdd = sc.parallelize(List("one","two","three","four","five")) rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[7] at parallelize at <console>:24
scala> val rdd1 = sc.parallelize(1 to rdd.count.toInt) rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[8] at parallelize at <console>:26
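rdd1 holds 1 to 5, presumably so the two RDDs can be zipped into numbered words; a sketch of that step (an assumption about the intended continuation):
scala> rdd1.zip(rdd).collect
// Array((1,one), (2,two), (3,three), (4,four), (5,five))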
Aggregation operations
1.mapValues [Pair]
def mapValues[U](f: V => U): RDD[(K, U)]
Description: turns an RDD[(K, V)] into an RDD[(K, U)] by applying f: V => U to each value; keys are left unchanged.
scala> val rdd = sc.parallelize(List("one","two","three","four","five")) rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[23] at parallelize at <console>
scala> val a = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
a: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[30] at parallelize at <console>:24
scala> val b = sc.parallelize(List(1,1,2,2,2,1,2,2,2), 3)
b: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[31] at parallelize at <console>:24
scala> val c = b.zip(a)
c: org.apache.spark.rdd.RDD[(Int, String)] = ZippedPartitionsRDD2[32] at zip at <console>:28
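The transcript does not show what c is used for; one typical continuation for a pair RDD built this way (an assumption) is grouping the names by their numeric key:
scala> c.groupByKey.collect
// Array((1,CompactBuffer(dog, cat, turkey)), (2,CompactBuffer(gnu, salmon, rabbit, wolf, bear, bee)))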
scala> val rdd = sc.parallelize(List("one","two","three","four","five")) rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[36] at parallelize at <console>:24
7.join
def join[W](other: RDD[(K, W)]): RDD[(K, (V, W))]
scala> val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
a: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[43] at parallelize at <console>:24
scala> val b = a.keyBy(_.length)
b: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[44] at keyBy at <console>:26
scala> val c = sc.parallelize(List("dog","cat","gnu","salmon","rabbit","turkey","wolf","bear","bee"), 3)
c: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[45] at parallelize at <console>:24
scala> val d = c.keyBy(_.length)
d: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[46] at keyBy at <console>:26
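The join call itself is missing; a minimal sketch using b and d (output order varies; every b entry is paired with every d entry that shares the same key, i.e. the same word length):
scala> b.join(d).collect
// e.g. Array((6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)), ..., (3,(dog,dog)), (3,(dog,cat)), (3,(rat,gnu)), ...)
// "elephant" (length 8) has no length-8 partner in d, so the inner join drops it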
2.take(num: Int): Array[T]
Returns the elements at indexes 0 through num - 1 of the RDD, without sorting.
scala> var rdd1 = sc.makeRDD(Seq(10,4,3,12,3))
rdd1: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[4] at makeRDD at <console>:24
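A minimal sketch of take on this RDD (no sorting is performed, so the first elements come back in partition order):
scala> rdd1.take(2)
// Array[Int] = Array(10, 4)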
7.aggregate
def aggregate[U: ClassTag](zeroValue: U)(seqOp: (U, T) => U, combOp: (U, U) => U): U
Description: aggregate folds the elements of an RDD in two steps: seqOp first aggregates the T-typed elements within each partition into a value of type U, then combOp merges the per-partition U results into a single U. Note that both seqOp and combOp start from zeroValue, whose type is U.
scala> val z = sc.parallelize(List(1,2,3,4,5,6), 2)
scala> z.aggregate(0)(math.max(_, _), _ + _)
res40: Int = 9
Explanation:
step 1: in the first partition, math.max is folded over [0, 1, 2, 3] (zeroValue plus that partition's elements), giving 3
step 2: in the second partition, math.max over [0, 4, 5, 6] gives 6
step n: in the n-th partition, math.max yields that partition's max
final step: combOp (_ + _) merges the partition results, again starting from zeroValue: 0 + 3 + 6 = 9
Change the initial value to 5:
scala> z.aggregate(5)(math.max(_, _), _ + _)
What is the result?
Explanation:
// This example returns 16 since the initial value is 5
// reduce of partition 0 will be max(5, 1, 2, 3) = 5
// reduce of partition 1 will be max(5, 4, 5, 6) = 6
// final reduce across partitions will be 5 + 5 + 6 = 16
// note the final reduce includes the initial value
Worked example:
scala> val z = sc.parallelize(List("12","23","345","4567"),2)
z: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[3] at parallelize at <console>:24
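The transcript ends here; a plausible continuation is the classic length-comparison aggregate (an assumption, not from the original; the result is nondeterministic because the merge order of partition results varies):
scala> z.aggregate("")((x, y) => math.max(x.length, y.length).toString, (x, y) => x + y)
// String = "24" or "42": partition 0 ["12","23"] folds to "2", partition 1 ["345","4567"] folds to "4"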