sc.makeRDD(0 to 10)
res0.collect
sc.parallelize(1 to 10)
res2.collect
sc.makeRDD(Array(1,2,3))
// value-type RDDs, e.g. RDD[Int]
// key-value RDDs, e.g. RDD[(Int,String)] or RDD[(String,Int)]
// every key-value RDD can also be viewed as a value-type RDD whose elements are tuples
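// for example (an illustrative snippet; the names below are made up): a key-value
// RDD still accepts the ordinary value-type operations, treating each pair as one element
val pairs = sc.makeRDD(Array((1, "a"), (2, "b"), (3, "c")))
pairs.map(kv => kv._1 * 10).collect   // Array(10, 20, 30)
pairs.count                           // 3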
val rdd = sc.makeRDD(1 to 100)
rdd.map(1 to _).collect
res6: Array[scala.collection.immutable.Range.Inclusive] = Array(Range(1), Range(1, 2), Range(1, 2, 3), Range(1, 2, 3, 4), Range(1, 2, 3, 4, 5), Range(1, 2, 3, 4, 5, 6), Range(1, 2, 3, 4, 5, 6, 7), Range(1, 2, 3, 4, 5, 6, 7, 8), Range(1, 2, 3, 4, 5, 6, 7, 8, 9), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18), Range(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ..
rdd.flatMap(1 to _).collect
res7: Array[Int] = Array(1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 1...
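// note: map yields one Range per element (res6), while flatMap flattens those Ranges into a single RDD[Int] (res7)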
val rdd = sc.makeRDD(1 to 100)
rdd.mapPartitions(items => items.filter(_%3==0).map(_+"hello")).collect
res9: Array[String] = Array(3hello, 6hello, 9hello, 12hello, 15hello, 18hello, 21hello, 24hello, 27hello, 30hello, 33hello, 36hello, 39hello, 42hello, 45hello, 48hello, 51hello, 54hello, 57hello, 60hello, 63hello, 66hello, 69hello, 72hello, 75hello, 78hello, 81hello, 84hello, 87hello, 90hello, 93hello, 96hello, 99hello)
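// note: mapPartitions calls the function once per partition with an iterator over its
// elements, rather than once per element; useful when per-partition setup (e.g. opening a connection) is expensive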
val rdd = sc.makeRDD(1 to 100,5)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[9] at makeRDD at <console>:24
rdd.partitions.size
res10: Int = 5
rdd.mapPartitionsWithIndex((i,items) => Iterator(i + ":["+items.mkString(",")+"]")).collect
res12: Array[String] = Array(0:[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], 1:[21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40], 2:[41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60], 3:[61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80], 4:[81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100])
rdd.partitioner
res13: Option[org.apache.spark.Partitioner] = None
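// a parallelized RDD carries no partitioner; a pair RDD gains one after partitionBy
// (a minimal sketch; the kv name and HashPartitioner(2) are arbitrary choices here)
val kv = rdd.map(x => (x, x)).partitionBy(new org.apache.spark.HashPartitioner(2))
kv.partitioner   // now Some(HashPartitioner)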
// sampling with replacement: sample(withReplacement = true, fraction = 0.3, seed = 5); note the duplicates below
rdd.sample(true,0.3,5).collect
res14: Array[Int] = Array(7, 13, 13, 18, 28, 28, 29, 32, 34, 36, 45, 49, 54, 65, 66, 67, 68, 71, 72, 76, 78, 80, 80, 84, 87, 88, 90, 92, 96)
// union (duplicates are kept — 10 appears twice below)
sc.makeRDD(1 to 10).union(sc.makeRDD(10 to 20)).collect
res16: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)
// intersection
sc.makeRDD(1 to 10).intersection(sc.makeRDD(10 to 20)).collect
res17: Array[Int] = Array(10)
// deduplicate with distinct (the result order is not preserved)
sc.makeRDD(1 to 10).union(sc.makeRDD(10 to 20)).distinct.collect
res18: Array[Int] = Array(4, 16, 14, 6, 8, 12, 18, 20, 10, 2, 13, 19, 15, 11, 1, 17, 3, 7, 9, 5)
val rdd1 = sc.makeRDD(Array((1,1),(2,2),(1,3),(2,7),(3,5)))
// sum the values for each key
rdd1.reduceByKey(_+_).collect
res19: Array[(Int, Int)] = Array((1,4), (3,5), (2,9))
// group the values sharing a key together
rdd1.groupByKey().collect
res21: Array[(Int, Iterable[Int])] = Array((1,CompactBuffer(1, 3)), (3,CompactBuffer(5)), (2,CompactBuffer(2, 7)))
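// note: for aggregations like the reduceByKey sum above, prefer reduceByKey over groupByKey —
// reduceByKey combines map-side before the shuffle, while groupByKey ships every value across the network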
val rdd2 = sc.makeRDD(Array(("a",90),("a",80),("a",60),("b",78),("b",84),("b",96),("c",90),("c",86)))
combineByKey[C](
  createCombiner: V => C,
  mergeValue: (C, V) => C,
  mergeCombiners: (C, C) => C,
  partitioner: org.apache.spark.Partitioner,
  mapSideCombine: Boolean = { /* compiled code */ },
  serializer: org.apache.spark.serializer.Serializer = { /* compiled code */ }
): org.apache.spark.rdd.RDD[(K, C)] = { /* compiled code */ }
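// createCombiner turns the first value seen for a key into an accumulator C; mergeValue folds each
// further value of that key into the accumulator within a partition; mergeCombiners merges
// accumulators for the same key across partitions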
// total score and number of subjects per key
rdd2.combineByKey(
  v => (v, 1),                                        // createCombiner: first score -> (sum, count)
  (c: (Int, Int), v) => (c._1 + v, c._2 + 1),         // mergeValue: fold another score into the pair
  (c1: (Int, Int), c2: (Int, Int)) => (c1._1 + c2._1, c1._2 + c2._2)  // mergeCombiners: merge partitions
)
res26.collect
res27: Array[(String, (Int, Int))] = Array((a,(230,3)), (b,(258,3)), (c,(176,2)))
// average score per key (total / count)
res26.map{ case (k, v) => (k, v._1 / v._2) }.collect
res33: Array[(String, Int)] = Array((a,76), (b,86), (c,88))
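// note: v._1 / v._2 is integer division (230/3 = 76); use v._1.toDouble / v._2 for a fractional average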
// aggregateByKey needs a key-value RDD; on rdd2 (the scores above) this takes the max
// score per key within each partition, then sums those maxima across partitions
rdd2.aggregateByKey(0)(math.max(_, _), _ + _).collect
// aggregateByKey(zero)(seqOp, combOp) is combineByKey with:
rdd2.combineByKey(
  (v: Int) => math.max(0, v),           // createCombiner: fold the zero value into the first value
  (c: Int, v: Int) => math.max(c, v),   // mergeValue = seqOp
  (c1: Int, c2: Int) => c1 + c2         // mergeCombiners = combOp
).collect
val rdd = sc.makeRDD(1 to 10)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[1] at makeRDD at <console>:24
rdd.sortBy(_ % 4).collect
res8: Array[Int] = Array(4, 8, 1, 5, 9, 2, 6, 10, 3, 7)
scala> val rdd = sc.parallelize(List("hi","hello","how","are","you"),1)
rdd: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[5] at parallelize at <console>:24
scala> rdd.pipe("/root/pipe.sh").collect
res9: Array[String] = Array(AA, >>>hi, >>>hello, >>>how, >>>are, >>>you)
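// /root/pipe.sh itself is not shown; judging from the output, a script along these
// lines would produce it (a sketch, not the original file) — pipe runs it once per
// partition, feeding the elements on stdin and collecting stdout lines:
//   #!/bin/sh
//   echo "AA"
//   while read LINE; do echo ">>>$LINE"; done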
scala> val rdd = sc.makeRDD(1 to 10, 2)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[7] at makeRDD at <console>:24
scala> rdd.glom.collect
res10: Array[Array[Int]] = Array(Array(1, 2, 3, 4, 5), Array(6, 7, 8, 9, 10))
scala> rdd.reduce(_+_)
res12: Int = 55
scala> rdd.count
res13: Long = 10
scala> rdd.first
res14: Int = 1
scala> rdd.take(5)
res15: Array[Int] = Array(1, 2, 3, 4, 5)
scala> rdd.takeOrdered(8)
res18: Array[Int] = Array(1, 2, 3, 4, 5, 6, 7, 8)