join 优化应该是 Spark 相关岗位面试必考的内容。常见的 join 其实就分为两类：map-side join 和 reduce-side join。当大表和小表 join 时，用 map-side join 能显著提高效率。
/**
* Created by shenjiyi on 2015/7/8.
*/
package com.test
import com.test.utils.MySparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
object TestJoin {
def main (args: Array[String]): Unit ={
val conf = new SparkConf()
.setMaster(args(0))
.setAppName("TestJoin")
.set("spark.speculation", "true")
.set("spark.default.parallelism", "200")
val sc = new MySparkContext(conf)
val input1 = sc.rawTextFile(args(1), "GB18030")
val input2 = sc.rawTextFile(args(2), "GB18030")
val output1 = args(3)
val output2 = arg