sc.makeRDD(0 to 10)
res0.collect
sc.parallelize(1 to 10)
res2.collect
sc.makeRDD(Array(1,2,3))
// value-type RDDs, e.g. RDD[Int]
// key-value RDDs, e.g. RDD[(Int,String)] or RDD[(String,Int)]
// every key-value RDD can also be viewed as a value-type RDD whose elements are tuples
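// for example (an illustrative snippet; the names below are made up): a key-value
// RDD still accepts the ordinary value-type operations, treating each pair as one element
val pairs = sc.makeRDD(Array((1, "a"), (2, "b"), (3, "c")))
pairs.map(kv => kv._1 * 10).collect   // Array(10, 20, 30)
pairs.count                           // 3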
val rdd = sc.makeRDD(1 to 100)
rdd.map(1 to _).collect
res6: Array[scala.collection.immutable.Range.Inclusive] = Array(Range(1), Range(1, 2), Range(1, 2, 3), Range(1, 2, 3, 4), Range(1, 2, 3, 4, 5), Range(1, 2, 3, 4, 5, 6), Range(1, 2, 3, 4, 5, 6, 7), Range(1, 2, 3, 4, 5, 6, 7, 8), Range(1, 2, 3, 4, 5, 6, 7, 8, 9), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ..
rdd.flatMap(1 to _).collect
res7: Array[Int] = Array(1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 1...
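// note: map yields one Range per element (res6), while flatMap flattens those Ranges into a single RDD[Int] (res7)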
val rdd = sc.makeRDD(1 to 100)
rdd.mapPartitions(items => items.filter(_%3==0).map(_+"hello")).collect
res9: Array[String] = Array(3hello, 6hello, 9hello, 12hello, 15hello, 18hello, 21hello, 24hello, 27hello, 30hello, 33hello, 36hello, 39hello, 42hello, 45hello, 48hello, 51hello, 54hello, 57hello, 60hello, 63hello, 66hello, 69hello, 72hello, 75hello, 78hello, 81hello, 84hello, 87hello, 90hello, 93hello, 96hello, 99hello)
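// note: mapPartitions calls the function once per partition with an iterator over its
// elements, rather than once per element; useful when per-partition setup (e.g. opening a connection) is expensive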
val rdd = sc.makeRDD(1 to 100,5)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[9] at makeRDD at <console>:24
rdd.partitions.size
res10: Int = 5
rdd.mapPartitionsWithIndex((i,items) => Iterator(i + ":["+items.mkString(",")+"]")).collect
res12: Array[String] = Array(0:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], 1:[21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40], 2:[41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60], 3:[61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80], 4:[81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100])
rdd.partitioner
res13: Option[org.apache.spark.Partitioner] = None
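// a parallelized RDD carries no partitioner; a pair RDD gains one after partitionBy
// (a minimal sketch; the kv name and HashPartitioner(2) are arbitrary choices here)
val kv = rdd.map(x => (x, x)).partitionBy(new org.apache.spark.HashPartitioner(2))
kv.partitioner   // now Some(HashPartitioner)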
// sampling with replacement: sample(withReplacement = true, fraction = 0.3, seed = 5); note the duplicates below
rdd.sample(true,0.3,5).collect
res14: Array[Int] = Array(7, 13, 13, 18, 28, 28, 29, 32, 34, 36, 45, 49, 54, 65, 66, 67, 68, 71, 72, 76, 78, 80, 80, 84, 87, 88, 90, 92, 96)
// union (duplicates are kept — 10 appears twice below)
sc.makeRDD(1 to 10).union(sc.makeRDD(10 to 20)).collect
res16: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
// intersection
sc.makeRDD(1 to 10).intersection(sc.makeRDD(10 to 20)).collect
res17: Array[Int] = Array(10)
// deduplicate with distinct (the result order is not preserved)
sc.makeRDD(1 to 10).union(sc.makeRDD(10 to 20)).distinct.collect
res18: Array[Int] = Array(4, 16, 14, 6, 8, 12, 18, 20, 10, 2, 13, 19, 15, 11, 1, 17, 3, 7, 9, 5)
val rdd1 = sc.makeRDD(Array((1,1),(2,2),(1,3),(2,7),(3,5)))
// sum the values for each key
rdd1.reduceByKey(_+_).collect
res19: Array[(Int, Int)] = Array((1,4), (3,5), (2,9))
// group the values sharing a key together
rdd1.groupByKey().collect
res21: Array[(Int, Iterable[Int])] = Array((1,CompactBuffer(1, 3)), (3,CompactBuffer(5)), (2,CompactBuffer(2, 7)))
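// note: for aggregations like the reduceByKey sum above, prefer reduceByKey over groupByKey —
// reduceByKey combines map-side before the shuffle, while groupByKey ships every value across the network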
val rdd2 = sc.makeRDD(Array(("a",90),("a",80),("a",60),("b",78),("b",84),("b",96),("c",90),("c",86)))
combineByKey[C](
  createCombiner: V => C,
  mergeValue: (C, V) => C,
  mergeCombiners: (C, C) => C,
  partitioner: org.apache.spark.Partitioner,
  mapSideCombine: Boolean = { /* compiled code */ },
  serializer: org.apache.spark.serializer.Serializer = { /* compiled code */ }
): org.apache.spark.rdd.RDD[(K, C)] = { /* compiled code */ }
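// createCombiner turns the first value seen for a key into an accumulator C; mergeValue folds each
// further value of that key into the accumulator within a partition; mergeCombiners merges
// accumulators for the same key across partitions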
// total score and number of subjects per key
rdd2.combineByKey(
  v => (v, 1),                                        // createCombiner: first score -> (sum, count)
  (c: (Int, Int), v) => (c._1 + v, c._2 + 1),         // mergeValue: fold another score into the pair
  (c1: (Int, Int), c2: (Int, Int)) => (c1._1 + c2._1, c1._2 + c2._2)  // mergeCombiners: merge partitions
)
res26.collect
res27: Array[(String, (Int, Int))] = Array((a,(230,3)), (b,(258,3)), (c,(176,2)))
// average score per key (total / count)
res26.map{ case (k, v) => (k, v._1 / v._2) }.collect
res33: Array[(String, Int)] = Array((a,76), (b,86), (c,88))
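// note: v._1 / v._2 is integer division (230/3 = 76); use v._1.toDouble / v._2 for a fractional average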
// aggregateByKey needs a key-value RDD; on rdd2 (the scores above) this takes the max
// score per key within each partition, then sums those maxima across partitions
rdd2.aggregateByKey(0)(math.max(_, _), _ + _).collect
// aggregateByKey(zero)(seqOp, combOp) is combineByKey with:
rdd2.combineByKey(
  (v: Int) => math.max(0, v),           // createCombiner: fold the zero value into the first value
  (c: Int, v: Int) => math.max(c, v),   // mergeValue = seqOp
  (c1: Int, c2: Int) => c1 + c2         // mergeCombiners = combOp
).collect
val rdd = sc.makeRDD(1 to 10)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at makeRDD at <console>:24
rdd.sortBy(_ % 4).collect
res8: Array[Int] = Array(4, 8, 1, 5, 9, 2, 6, 10, 3, 7)
scala> val rdd = sc.parallelize(List("hi","hello","how","are","you"),1)
rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[5] at parallelize at <console>:24
scala> rdd.pipe("/root/pipe.sh").collect
res9: Array[String] = Array(AA, >>>hi, >>>hello, >>>how, >>>are, >>>you)
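// /root/pipe.sh itself is not shown; judging from the output, a script along these
// lines would produce it (a sketch, not the original file) — pipe runs it once per
// partition, feeding the elements on stdin and collecting stdout lines:
//   #!/bin/sh
//   echo "AA"
//   while read LINE; do echo ">>>$LINE"; done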
scala> val rdd = sc.makeRDD(1 to 10, 2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[7] at makeRDD at <console>:24
scala> rdd.glom.collect
res10: Array[Array[Int]] = Array(Array(1, 2, 3, 4, 5), Array(6, 7, 8, 9, 10))
scala> rdd.reduce(_+_)
res12: Int = 55
scala> rdd.count
res13: Long = 10
scala> rdd.first
res14: Int = 1
scala> rdd.take(5)
res15: Array[Int] = Array(1, 2, 3, 4, 5)
scala> rdd.takeOrdered(8)
res18: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8)