Scala version:
package com.bbw5.dataalgorithms.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
/**
 * This class provides a basic implementation of the "left outer join"
 * operation for two given tables. It is provided as an educational
 * tool for understanding how the "left outer join" works.
 *
 * users table: (user_id, location_id)
 * transactions table: (transaction_id, product_id, user_id, quantity, amount)
 *
 * Note that the Spark API already provides this functionality via
 * leftOuterJoin() on pair RDDs (used in version two below).
 *
 * @author baibaw5
 *
 */
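// Hypothetical sample input (values invented here for illustration, not
// taken from the original post), matching the schemas described above:
//
//   user.txt:         u1,UT
//                     u2,GA
//
//   transaction.txt:  t1,p3,u1,1,300
//                     t2,p1,u2,1,100
//                     t3,p1,u1,1,100
//                     t4,p2,u4,1,10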
object SparkLeftOuterJoin {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkLeftOuterJoin")
    val sc = new SparkContext(sparkConf)

    val userFilename = "G:/temp/data/user.txt"
    val tranFilename = "G:/temp/data/transaction.txt"
    val userFile = sc.textFile(userFilename)
    val tranFile = sc.textFile(tranFilename)
    // version one: emulate the left outer join by tagging each record
    // and grouping all records for the same user_id together
    // (user_id, ("L", location_id))
    val userRDD = userFile.map(_.split(",")).map(d => (d(0), ("L", d(1))))
    // (user_id, ("P", product_id))
    val tranRDD = tranFile.map(_.split(",")).map(d => (d(2), ("P", d(1))))
    val groupRDD = tranRDD.union(userRDD).groupByKey()
    // debug output: inspect the grouped records per user
    groupRDD.foreach(println)
    val plRDD = groupRDD.mapValues { iter =>
      // pick the user's location; fall back to "UNKNOWN" when the user
      // has no row in the users table (left outer join semantics; the
      // original .toArray.apply(0) would throw in that case)
      val location = iter.find(_._1 == "L").map(_._2).getOrElse("UNKNOWN")
      iter.filter(_._1 == "P").map(p => (p._2, location))
    }.flatMap(_._2)
    // count the purchase locations recorded for each product
    plRDD.groupByKey().mapValues(_.size).foreach(println)
    // version two: use the leftOuterJoin() that the Spark API provides
    // for pair RDDs
    // (user_id, location_id)
    val userRDD2 = userFile.map(_.split(",")).map(d => (d(0), d(1)))
    // (user_id, product_id)
    val tranRDD2 = tranFile.map(_.split(",")).map(d => (d(2), d(1)))
    val joinRDD2 = tranRDD2.leftOuterJoin(userRDD2)
    joinRDD2.map { case (_, (product, locOpt)) => (product, locOpt.getOrElse("UNKNOWN")) }
      .groupByKey().mapValues(_.size).foreach(println)

    sc.stop()
  }
}
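For the hypothetical sample input sketched in the comments above, both versions print the same per-product counts of purchase locations: (p1,2), (p3,1), (p2,1). Product p2 lands in the "UNKNOWN" bucket because user u4 has no row in the users table, which is exactly where a left outer join differs from an inner join. To try this locally one could, as an assumption about the runtime setup, add .setMaster("local[2]") to the SparkConf or pass --master local[2] to spark-submit; the code as written leaves the master to be supplied externally.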