需求:对两个 RDD 做 join 操作,其中较小的一个先收集到 Driver 端,再通过广播变量分发给各个 Executor,从而在 map 端完成 join(避免 shuffle)。
/**
 * Map-side (broadcast) inner join of two keyed datasets.
 *
 * The smaller dataset is collected to the driver as a `Map` and broadcast, so
 * the larger RDD can be joined inside `mapPartitions` without going through
 * the RDD `join` operator.
 *
 * For the sample data this prints:
 * {{{
 * (10,AA,十)
 * (11,BB,十一)
 * }}}
 */
object broadcastApp {

  /**
   * Entry point: builds the two sample datasets, joins them via a broadcast
   * variable and prints every matched `(key, leftValue, rightValue)` triple.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // No master / app name is set here; they must be supplied externally
    // (e.g. by spark-submit).
    val conf = new SparkConf()
    val sc = new SparkContext(conf)
    try {
      // Small side of the join: materialized on the driver as a Map so it can be broadcast.
      val smallSide = sc.parallelize(Array(("10", "AA"), ("11", "BB"))).collectAsMap()
      // Large side of the join: stays distributed.
      val largeSide = sc.parallelize(Array(("10", "十"), ("11", "十一"), ("12", "十二")))
      // Ship the lookup map to the worker nodes.
      val smallSideBc = sc.broadcast(smallSide)

      largeSide
        .mapPartitions { rows =>
          // Read the broadcast value once per partition rather than once per row.
          val lookup = smallSideBc.value
          // Inner-join semantics: one lookup per key, rows without a match are dropped.
          rows.flatMap { case (key, right) =>
            lookup.get(key).map(left => (key, left, right))
          }
        }
        .collect()
        .foreach(println)
    } finally {
      // Always release the SparkContext, even when the job above fails.
      sc.stop()
    }
  }
}
输出结果:
(10,AA,十)
(11,BB,十一)