左外连接三种方式
要求:对每种已售出的商品,统计购买该商品的用户所在的不同位置(去重后)的数量
数据:
用户表 users.tsv
user_id location_id
u1 UT
u2 GA
u3 CA
u4 CA
u5 GA
交易信息 transactions.tsv
transaction_id product_id user_id quantity amount
t1 p3 u1 1 300
t2 p1 u2 1 100
t3 p1 u1 1 100
t4 p2 u2 1 10
t5 p4 u4 1 9
t6 p1 u1 1 100
t7 p4 u1 1 9
t8 p4 u5 2 40
第一种方式,使用 union
object LeftOuterJoin {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setMaster("local[*]").setAppName("LeftOuterJoin")
val sc = new SparkContext(conf)
val usersRaw = sc.textFile("users.tsv")
val transactionsRaw = sc.textFile("transactions.tsv")
val users = usersRaw.map(l => {
val tokens = l.split("\t")
(tokens(0), ("L", tokens(1)))
})
val transactions = transactionsRaw.map(l => {
val tokens = l.split("\t")
(tokens(2), ("P", tokens(1)))
})
val all = users union transactions
all.groupByKey()
.flatMap{ case (_, itr) => {
val (location, products) = itr.span(_._1 == "L")
val loc = location.headOption.getOrElse(("L", "UNKNOWN"))
products.filter(_._1 == "P").map(p => (p._2, loc._2)).toSet
}}
.groupByKey()
.map(t =>