1.sortBy
val rdd3 = sc.parallelize(List(1,2,3,4,19,29,10))
rdd3.map(_*2).sortBy(x=>x).collect        // ascending (the default)
rdd3.map(_*2).sortBy(x=>x,false).collect  // false = descending
2.filter
val rdd3 = sc.parallelize(List(1,2,3,4,19,29,10))
rdd3.map(_*2).filter(_>10).collect  // filter: keep only elements greater than 10
3.flatMap (flatten)
val rdd4 = sc.parallelize(Array("a b c","d e f g"))
rdd4.flatMap(_.split(" ")).collect
4.sample (sampling)
val rdd = sc.parallelize(1 to 10)
val sample = rdd.sample(false,0.5).collect  // the fraction (0.5 here) is the expected proportion to draw; the exact count is not guaranteed
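sample also accepts an optional seed, which makes the draw reproducible; a minimal sketch (the seed value 42 is arbitrary):
// Same seed and fraction: the same elements come back on every run.
val seeded = rdd.sample(false, 0.5, 42).collect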
5.union, intersection, distinct
val rdd = sc.parallelize(1 to 10)
val rdd2 = sc.parallelize(6 to 12)
(rdd union rdd2).collect         // union
(rdd intersection rdd2).collect  // intersection
rdd2.distinct.collect.toBuffer   // deduplicate
6.join
val rdd1 = sc.parallelize(List(("tom",18),("tom1",18),("tom2",18),("tom3",18)))
val rdd2 = sc.parallelize(List(("tom",18),("tom1",18),("tom5",18),("tom4",18)))
join
(rdd1 join rdd2).collect
Result 1
res7: Array[(String, (Int, Int))] = Array((tom,(18,18)), (tom1,(18,18)))
leftOuterJoin
(rdd1 leftOuterJoin rdd2).collect
Result 2
res10: Array[(String, (Int, Option[Int]))] = Array((tom2,(18,None)), (tom,(18,Some(18))), (tom1,(18,Some(18))), (tom3,(18,None)))
rightOuterJoin
(rdd1 rightOuterJoin rdd2).collect
Result 3
res11: Array[(String, (Option[Int], Int))] = Array((tom,(Some(18),18)), (tom4,(None,18)), (tom1,(Some(18),18)), (tom5,(None,18)))
7.groupBy, cogroup
val rdd1 = sc.parallelize(List(("tom",18),("tom1",18),("tom2",18),("tom3",18)))
rdd1.groupBy(_._1).collect
val rdd1 = sc.parallelize(List(("tom",18),("tom1",18),("tom2",18),("tom3",18)))
val rdd2 = sc.parallelize(List(("tom",18),("tom1",18),("tom5",18),("tom4",18)))
(rdd1 cogroup rdd2).collect
Result
res76: Array[(String, (Iterable[Int], Iterable[Int]))] =
Array((tom2,(CompactBuffer(18),CompactBuffer())),
(tom,(CompactBuffer(18),CompactBuffer(18))),
(tom4,(CompactBuffer(),CompactBuffer(18))),
(tom1,(CompactBuffer(18),CompactBuffer(18))),
(tom3,(CompactBuffer(18),CompactBuffer())),
(tom5,(CompactBuffer(),CompactBuffer(18))))
8.reduceByKey
rdd1.reduceByKey(_+_).collect  // sums the values that share a key
reduceByKey aggregates locally within each partition first, then aggregates globally after the shuffle.
groupBy/groupByKey does no local aggregation: every record is shuffled first and only then aggregated globally, as the sketch below shows.
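A minimal sketch contrasting the two on the same pair RDD (the toy data is invented for illustration):
val pairs = sc.parallelize(List(("a",1),("b",1),("a",1),("a",1)), 2)
// reduceByKey combines values inside each partition before shuffling
pairs.reduceByKey(_ + _).collect            // e.g. Array((a,3), (b,1))
// groupByKey shuffles every (key, value) pair, then we sum the groups
pairs.groupByKey().mapValues(_.sum).collect // same result, more data moved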
9.partitions.size
val rdd1 = sc.parallelize(List(("tom",18),("tom1",18),("tom2",18),("tom3",18)),3)
rdd1.partitions.size  // 3, the number requested in parallelize
10.mapPartitions
val rdd1 = sc.parallelize(List(1,2,3,4),3)
rdd1.partitions.size
rdd1.mapPartitions(_.map(_*10)).collect
Difference between map and mapPartitions:
map applies func to every single element, while mapPartitions applies func once to each partition's iterator.
Use cases: when the RDD holds little data, map is fine.
With larger data sets, mapPartitions can be more efficient, because per-partition setup cost is paid once per partition rather than once per element; see the sketch below.
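A minimal sketch of that pattern; the setup here is a stand-in (in practice it might be opening a database connection):
val nums = sc.parallelize(List(1, 2, 3, 4), 3)
nums.mapPartitions { iter =>
  // Anything expensive placed here runs once per partition,
  // not once per element.
  val factor = 10
  iter.map(_ * factor)
}.collect  // Array(10, 20, 30, 40)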
11.coalesce (change the number of partitions)
coalesce(numPartitions: Int, shuffle: Boolean = false)
Turning few partitions into more requires a shuffle, so pass shuffle = true.
Turning many partitions into fewer needs no shuffle; shuffle defaults to false. A sketch follows.
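A minimal sketch of both directions, checked via partitions.size:
val base = sc.parallelize(1 to 100, 4)
base.coalesce(2).partitions.size        // 2: shrinking needs no shuffle
base.coalesce(8).partitions.size        // still 4: cannot grow without a shuffle
base.coalesce(8, true).partitions.size  // 8: shuffle = true allows growing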
12.partitionBy
partitionBy redistributes a pair RDD according to a partitioning rule, i.e. a Partitioner; you can use a built-in one or write your own.
def partitionBy(partitioner: Partitioner): RDD[(K, V)]
val rdd1 = sc.parallelize(List(("tom",18),("tom1",18),("tom2",18),("tom3",18)),3)
rdd1.partitionBy(new org.apache.spark.HashPartitioner(5))
Result
res22: org.apache.spark.rdd.RDD[(String, Int)] =
ShuffledRDD[32] at partitionBy at <console>:26
rdd1.partitionBy(new org.apache.spark.HashPartitioner(5)).partitions.length  // 5
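Beyond the built-in HashPartitioner, a hand-written Partitioner is a small class; a minimal sketch (the routing rule is invented for illustration):
import org.apache.spark.Partitioner

// Routes the key "tom" to partition 0 and every other key to partition 1.
class NamePartitioner extends Partitioner {
  override def numPartitions: Int = 2
  override def getPartition(key: Any): Int =
    if (key == "tom") 0 else 1
}

rdd1.partitionBy(new NamePartitioner).partitions.length  // 2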
13.repartition
rdd.repartition(7)  // always shuffles; equivalent to coalesce(7, shuffle = true)
rdd1.repartitionAndSortWithinPartitions(new org.apache.spark.HashPartitioner(1))  // pair RDDs only: sorts by key within each partition during the shuffle
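Sorting while repartitioning is cheaper than repartition followed by sortByKey, because the sort is pushed into the shuffle machinery; a minimal sketch:
val kv = sc.parallelize(List((3,"c"), (1,"a"), (2,"b"), (1,"d")), 3)
kv.repartitionAndSortWithinPartitions(new org.apache.spark.HashPartitioner(2))
  .glom.collect  // each inner Array comes back sorted by key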
14.aggregate (curried)
val rdd1 = sc.parallelize(List("a","b","c","d","e","f"),2)
rdd1.aggregate("")(_+_,_+_)
Result 1
res25: String = abcdef  // first run
res26: String = defabc  // second run: which partition's result arrives first varies
rdd1.aggregate("+")(_+_,_+_)
Result 2
res27: String = ++def+abc  // first run
res30: String = ++abc+def  // second run
(The zero value "+" is used once as the seqOp's initial value in each partition and once more by the combOp, so "+" appears three times.)
val rdd2 = sc.parallelize(List("12","123","1234","123456"),2)
rdd2.aggregate("")((x,y)=>math.max(x.length,y.length).toString,(x,y)=>x+y)
Result 3
res32: String = 36  // first run
res34: String = 63  // second run
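To see where 36/63 comes from, glom exposes each partition's contents; a sketch of the trace:
rdd2.glom.collect  // Array(Array(12, 123), Array(1234, 123456))
// partition 0: max("".length, "12".length) -> "2", then max(1, 3) -> "3"
// partition 1: max(0, 4) -> "4", then max(1, 6) -> "6"
// combOp concatenates "3" and "6" in whichever order they arrive: "36" or "63"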