如果出现某些函数无法使用的情况,请先导入对应的包:
import org.apache.spark.api.java
union
++ 等价于 union,将两个rdd做并集
val rdd1 = sc.parallelize(List(1,2,3,4,5))
val rdd2 = sc.parallelize(List(6,7,8,9,10))
val rdd3 = rdd1 ++ rdd2
结果就是 rdd(1,2,3,4,5,6,7,8,9,10)
分区个数= rdd1.分区数 + rdd2.分区数
intersection()求两个RDD交集
cartesian() 两个RDD做笛卡儿积
scala> val t1=sc.makeRDD(List("cat1","dog1"))
scala> val t2=sc.makeRDD(List("cat2","dog2"))
scala> t1.cartesian(t2)
res9: org.apache.spark.rdd.RDD[(String, String)] = CartesianRDD[7] at cartesian at <console>:28
scala> res9.collect
res10: Array[(String, String)] = Array((cat1,cat2), (cat1,dog2), (dog1,cat2), (dog1,dog2))
cogroup():整合两个RDD
val rdd1 = sc.parallelize(List(("cat", 1), ("dog", 1), ("cat", 3)))
val rdd2 = sc.parallelize(List(("cat", 2), ("dog", 2)))
rdd1.cogroup(rdd2)
调用 collect 后的结果是:
Array((cat,(CompactBuffer(3,1),CompactBuffer(2))), (dog,(CompactBuffer(1),CompactBuffer(2))))
join(内连接)
val rdd1 = sc.parallelize(List(("cat", 1), ("dog", 1), ("cat", 3)))
val rdd2 = sc.parallelize(List(("cat", 2), ("dog", 2), ("tiger", 2)))
rdd1.join(rdd2) // 内连接
将两个rdd集合中key相同的元素连接在一起
调用 collect 后的结果为:
Array((dog,(1,2)), (cat,(1,2)), (cat,(3,2)))
leftOuterJoin():左外连接 rightOuterJoin():右外连接
scala> val rdd1 = sc.parallelize(List(("cat", 1), ("dog", 1), ("cat", 3),("wolf",5)))
scala> val rdd2 = sc.parallelize(List(("cat", 2), ("dog", 2), ("tiger", 2)))
scala> rdd1.leftOuterJoin(rdd2).collect
res62: Array[(String, (Int, Option[Int]))] = Array((dog,(1,Some(2))), (wolf,(5,None)), (cat,(1,Some(2))), (cat,(3,Some(2))))
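scala> rdd1.rightOuterJoin(rdd2).collect
res64: Array[(String, (Option[Int], Int))] = Array((dog,(Some(1),2)), (cat,(Some(1),2)), (cat,(Some(3),2)), (tiger,(None,2)))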
import org.apache.spark.api.java
union
++ 等价于 union,将两个rdd做并集
val rdd1 = sc.parallelize(List(1,2,3,4,5))
val rdd2 = sc.parallelize(List(6,7,8,9,10))
val rdd3 = rdd1 ++ rdd2
结果就是 rdd(1,2,3,4,5,6,7,8,9,10)
分区个数= rdd1.分区数 + rdd2.分区数
intersection()求两个RDD交集
cartesian() 两个RDD做笛卡儿积
scala> val t1=sc.makeRDD(List("cat1","dog1"))
scala> val t2=sc.makeRDD(List("cat2","dog2"))
scala> t1.cartesian(t2)
res9: org.apache.spark.rdd.RDD[(String, String)] = CartesianRDD[7] at cartesian at <console>:28
scala> res9.collect
res10: Array[(String, String)] = Array((cat1,cat2), (cat1,dog2), (dog1,cat2), (dog1,dog2))
cogroup():整合两个RDD
val rdd1 = sc.parallelize(List(("cat", 1), ("dog", 1), ("cat", 3)))
val rdd2 = sc.parallelize(List(("cat", 2), ("dog", 2)))
rdd1.cogroup(rdd2)
调用 collect 后的结果是:
Array((cat,(CompactBuffer(3,1),CompactBuffer(2))), (dog,(CompactBuffer(1),CompactBuffer(2))))
join(内连接)
val rdd1 = sc.parallelize(List(("cat", 1), ("dog", 1), ("cat", 3)))
val rdd2 = sc.parallelize(List(("cat", 2), ("dog", 2), ("tiger", 2)))
rdd1.join(rdd2) // 内连接
将两个rdd集合中key相同的元素连接在一起
调用 collect 后的结果为:
Array((dog,(1,2)), (cat,(1,2)), (cat,(3,2)))
leftOuterJoin():左外连接 rightOuterJoin():右外连接
scala> val rdd1 = sc.parallelize(List(("cat", 1), ("dog", 1), ("cat", 3),("wolf",5)))
scala> val rdd2 = sc.parallelize(List(("cat", 2), ("dog", 2), ("tiger", 2)))
scala> rdd1.leftOuterJoin(rdd2).collect
res62: Array[(String, (Int, Option[Int]))] = Array((dog,(1,Some(2))), (wolf,(5,None)), (cat,(1,Some(2))), (cat,(3,Some(2))))
scala> rdd1.rightOuterJoin(rdd2).collect
res64: Array[(String, (Option[Int], Int))] = Array((dog,(Some(1),2)), (cat,(Some(1),2)), (cat,(Some(3),2)), (tiger,(None,2)))
欢迎加qq1204738320交流