【Data Algorithms_Recipes for Scaling up with Hadoop and Spark】Chapter4 LeftOuterJoin

:scala版

package com.bbw5.dataalgorithms.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * This object provides a basic implementation of the "left outer join"
 * operation for two given tables.  It is provided as an educational
 * tool for understanding the concept of "left outer join"
 * functionality.
 *
 * users table(user_id,location_id)
 * transactions table(transaction_id,product_id,user_id,quantity,amount)
 *
 * Note that the Spark API itself provides leftOuterJoin() for pair RDDs
 * (PairRDDFunctions in Scala, JavaPairRDD in Java).
 *
 * @author baibaw5
 *
 */

object SparkLeftOuterJoin {

  /**
   * Joins transactions (left side) with users (right side) on user_id and
   * prints, for every product_id, how many joined (product, location) rows
   * exist. The computation is done twice: once by hand with union +
   * groupByKey, and once with Spark's built-in leftOuterJoin.
   *
   * Input files are comma separated:
   *  - users:        user_id,location_id
   *  - transactions: transaction_id,product_id,user_id,quantity,amount
   *
   * @param args optional overrides: args(0) = users file path,
   *             args(1) = transactions file path. When omitted, the
   *             original hard-coded paths are used.
   */
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkLeftOuterJoin")
    val sc = new SparkContext(sparkConf)

    try {
      val userFilename = if (args.length > 0) args(0) else "G:/temp/data/user.txt"
      val tranFilename = if (args.length > 1) args(1) else "G:/temp/data/transaction.txt"
      val userFile = sc.textFile(userFilename)
      val tranFile = sc.textFile(tranFilename)

      // Location assigned to a transaction whose user_id has no row in the
      // users table; a left outer join must keep such rows, not drop them.
      val unknownLocation = "UNKNOWN"

      // (user_id, location_id); lines with fewer than 2 fields (e.g. blank
      // lines) are skipped instead of failing the job on d(1).
      val userRDD2 = userFile.map(_.split(","))
        .filter(_.length >= 2)
        .map(d => (d(0), d(1)))
      // (user_id, product_id); lines with fewer than 3 fields are skipped.
      val tranRDD2 = tranFile.map(_.split(","))
        .filter(_.length >= 3)
        .map(d => (d(2), d(1)))

      // version one: hand-written join.
      // Tag every value with its origin ("L" = location, "P" = product) so
      // both tables can share one RDD and be grouped by user_id.
      val userRDD = userRDD2.mapValues(location => ("L", location))
      val tranRDD = tranRDD2.mapValues(product => ("P", product))

      val groupRDD = tranRDD.union(userRDD).groupByKey()

      groupRDD.foreach(println)

      // For each user emit one (product_id, location_id) pair per transaction.
      // Users without transactions emit nothing; transactions without a user
      // row fall back to unknownLocation.
      val plRDD = groupRDD.flatMap { case (_, tagged) =>
        val location = tagged
          .collectFirst { case ("L", loc) => loc }
          .getOrElse(unknownLocation)
        tagged.collect { case ("P", product) => (product, location) }
      }

      // NOTE(review): this counts every joined row, duplicates included; if
      // the goal is the number of distinct locations per product, use
      // x.toSet.size instead -- confirm the intended semantics.
      plRDD.groupByKey().mapValues(x => x.size).foreach(println)

      // version two: the same result using Spark's built-in left outer join.
      val groupRDD2 = tranRDD2.leftOuterJoin(userRDD2)
      groupRDD2
        .map { case (_, (product, location)) =>
          (product, location.getOrElse(unknownLocation))
        }
        .groupByKey()
        .mapValues(x => x.size)
        .foreach(println)
    } finally {
      // Always release the context, even if a job above fails.
      sc.stop()
    }
  }
}


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值