Spark join 的几种方式
- join
- leftOuterJoin
- fullOuterJoin
- rightOuterJoin
准备数据
val a =sc.parallelize(Array("a","b","d","f")).map((_,1))
val b =sc.parallelize(Array("a","b","e","g")).map((_,1))
测试结果
a.join(b).collect 内连接:只返回 a、b 中都存在的 key
Array[(String, (Int, Int))] = Array((b,(1,1)), (a,(1,1)))
a.leftOuterJoin(b).collect 以 a 为主数据去匹配 b;b 中没有对应 key 时,b 一侧的值为 None
Array[(String, (Int, Option[Int]))] = Array((d,(1,None)), (b,(1,Some(1))), (f,(1,None)), (a,(1,Some(1))))
a.rightOuterJoin(b).collect 以 b 为主数据去匹配 a;a 中没有对应 key 时,a 一侧的值为 None
Array[(String, (Option[Int], Int))] = Array((b,(Some(1),1)), (e,(None,1)), (a,(Some(1),1)), (g,(None,1)))
a.fullOuterJoin(b).collect 返回 a、b 的全部数据;某一侧匹配不上时,该侧的值为 None
Array[(String, (Option[Int], Option[Int]))] = Array((d,(Some(1),None)), (b,(Some(1),Some(1))), (f,(Some(1),None)), (e,(None,Some(1))), (a,(Some(1),Some(1))), (g,(None,Some(1))))
集合交集、差集、笛卡尔积
- subtract()
- intersection()
- cartesian()
val a =sc.parallelize(Array("a","b","d","f"))
val b =sc.parallelize(Array("a","b","e","g"))
scala> a.subtract(b).collect() 差集:从 a 中去掉 b 中也存在的元素
res15: Array[String] = Array(d, f)
scala> a.intersection(b).collect() 交集:取 a、b 中都有的数据
res16: Array[String] = Array(b, a)
scala> a.cartesian(b).collect() 取a,b的笛卡尔积
res17: Array[(String, String)] = Array((a,a), (a,b), (b,a), (b,b), (a,e), (a,g), (b,e), (b,g), (d,a), (d,b), (f,a), (f,b), (d,e), (d,g), (f,e), (f,g))