1. The SparkUtils Utility Class
import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {
    /**
     * Default master URL
     */
    val DEFAULT_MASTER = "local[*]"

    /**
     * Get a SparkContext with the default master, local[*]
     */
    def getSparkContext(appName: String): SparkContext = getSparkContext(appName, DEFAULT_MASTER)

    def getSparkContext(appName: String, master: String): SparkContext =
        new SparkContext(new SparkConf().setAppName(appName).setMaster(master))

    /**
     * Release the SparkContext
     */
    def close(sc: SparkContext): Unit = if (sc != null) sc.stop()
}
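For reference, a minimal usage sketch of the utility (the object name SparkUtilsDemo and the toy job are mine, just to show the call pattern):

import org.apache.spark.SparkContext

object SparkUtilsDemo {
    def main(args: Array[String]): Unit = {
        // Uses the default local[*] master via the one-argument overload
        val sc: SparkContext = SparkUtils.getSparkContext("SparkUtilsDemo")
        println(sc.parallelize(1 to 10).sum()) // prints 55.0
        SparkUtils.close(sc)
    }
}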
2. Logging Utility
import org.apache.log4j.{Level, Logger}

/**
 * Mix this trait into a driver object to raise the log level of Spark,
 * Hadoop, and their shaded dependencies to WARN, silencing the INFO noise.
 */
trait LoggerTrait {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.spark_project").setLevel(Level.WARN)
}
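Because these statements live in the trait body, they run during the initialization of any object that mixes the trait in, i.e. before the Spark code in main executes. A minimal usage sketch (the object name MyQuietJob is mine; the demo in section 3 works the same way):

object MyQuietJob extends LoggerTrait {
    def main(args: Array[String]): Unit = {
        // By the time main runs, the log levels above are already set,
        // so the job prints only WARN-and-above framework logs
    }
}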
3. Spark Operators: combineByKey / reduceByKey Conversion
import cn.qphone.spark.common.LoggerTrait.LoggerTrait
import cn.qphone.spark.common.Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object Demo14_CombineByKey_ReduceByKey extends LoggerTrait {
    def main(args: Array[String]): Unit = {
        // 1. Get the SparkContext
        val sc = SparkUtils.getSparkContext("Demo14_CombineByKey_ReduceByKey")
        // 2. Run the word-count demo
        cbk2rbk(sc)
        // 3. Release resources
        SparkUtils.close(sc)
    }

    def cbk2rbk(sc: SparkContext): Unit = {
        val list: List[String] = List(
            "i am a big big boy",
"you are a abag girl"
        )
        // Three partitions, so both the within-partition (mergeValue) and the
        // cross-partition (mergeCombiners) phases show up in the console output
        val listRDD: RDD[String] = sc.parallelize(list, 3)
        val mapRDD: RDD[(String, Int)] = listRDD.flatMap(_.split("\\s+")).map((_, 1))
        val value: RDD[(String, Int)] = mapRDD.combineByKey(createCombiner, mergeValue, mergeCombiners)
        value.foreach(println)
    }
    // Called once per key per partition, turning that key's first value into the initial accumulator
    def createCombiner(num: Int): Int = {
        println("==================createCombiner<num = " + num + ">====================")
        num
    }

    // Within-partition partial aggregation: folds each subsequent value into the accumulator
    def mergeValue(sum: Int, num: Int): Int = {
        println("==================mergeValue: within-partition partial aggregation<sum = " + sum + ", num = " + num + ">====================")
        sum + num
    }

    // Cross-partition (global) aggregation: merges the per-partition accumulators
    def mergeCombiners(sum1: Int, sum2: Int): Int = {
        println("==================mergeCombiners: global aggregation<global accumulator = " + sum1 + ", per-partition result = " + sum2 + ">====================")
        sum1 + sum2
    }
}
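The point of the demo: here createCombiner is the identity function and mergeValue/mergeCombiners are both plain addition, which is exactly what reduceByKey(_ + _) does (reduceByKey is itself implemented on top of combineByKey). A minimal equivalent sketch (the method name rbk is mine):

def rbk(sc: SparkContext): Unit = {
    val list = List("i am a big big boy", "you are a big girl")
    val counts: RDD[(String, Int)] = sc.parallelize(list, 3)
        .flatMap(_.split("\\s+"))
        .map((_, 1))
        .reduceByKey(_ + _) // same two-phase aggregation: within-partition, then across partitions
    counts.foreach(println)
}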