一、SparkUtils utility class
import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {
  /**
   * Default master URL
   */
  val DEFAULT_MASTER = "local[*]"

  /**
   * Get a SparkContext using the default master (local[*])
   */
  def getSparkContext(appName: String): SparkContext = getSparkContext(appName, DEFAULT_MASTER)

  def getSparkContext(appName: String, master: String): SparkContext =
    new SparkContext(new SparkConf().setAppName(appName).setMaster(master))

  /**
   * Release the SparkContext
   */
  def close(sc: SparkContext): Unit = if (sc != null) sc.stop()
}
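A minimal usage sketch of the utility; the object name SparkUtilsDemo and the sample data are illustrative and not part of the original code, and the import path is the one used later in the demo:

import cn.qphone.spark.common.Utils.SparkUtils

object SparkUtilsDemo {
  def main(args: Array[String]): Unit = {
    // No master argument, so this defaults to local[*]
    val sc = SparkUtils.getSparkContext("SparkUtilsDemo")
    sc.parallelize(Seq("a", "b", "a"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)
      .foreach(println)
    // Always release the context when the job is done
    SparkUtils.close(sc)
  }
}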
二、Logging utility
import org.apache.log4j.{Level, Logger}

trait LoggerTrait {
  // Raise Spark/Hadoop internal loggers to WARN so the console is not flooded with INFO output
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
  Logger.getLogger("org.spark_project").setLevel(Level.WARN)
}
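Because these statements live in the trait's constructor body, the log levels are applied as soon as a driver object that mixes in the trait is initialized. A minimal sketch (the object name LogDemo is illustrative; the demo in the next section uses the same extends LoggerTrait pattern):

import cn.qphone.spark.common.LoggerTrait.LoggerTrait
import cn.qphone.spark.common.Utils.SparkUtils

object LogDemo extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    // Spark/Hadoop INFO logs are suppressed; only WARN and above reach the console
    val sc = SparkUtils.getSparkContext("LogDemo")
    sc.parallelize(1 to 3).foreach(println)
    SparkUtils.close(sc)
  }
}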
三、Spark operators: CombineByKey-to-GroupByKey conversion
import cn.qphone.spark.common.LoggerTrait.LoggerTrait
import cn.qphone.spark.common.Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ArrayBuffer
object Deom13_CombineByKey_GroupByKey extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    // 1. Obtain the SparkContext
    val sc = SparkUtils.getSparkContext("Deom13_CombineByKey_GroupByKey")
    // 2. Process the data
    cbk2gbk(sc)
    // 3. Release resources
    SparkUtils.close(sc)
  }
  def cbk2gbk(sc: SparkContext): Unit = {
    // 1. Sample data: "<name> <faction>"
    val stuList = List(
      "令狐冲 华山派",
      "岳不群 华山派",
      "虚竹 逍遥派",
      "乔峰 丐帮",
      "黄蓉 桃花岛",
      "杨过 古墓派",
      "小龙女 古墓派",
      "郭靖 丐帮"
    )
    // 2. Reshape each line into a (faction, name) pair
    val stuRDD: RDD[String] = sc.parallelize(stuList, 3)
    // Equivalent, simpler version using map:
    // val stusRDD: RDD[(String, String)] = stuRDD.map(line => {
    //   val index = line.lastIndexOf(" ")
    //   val classname = line.substring(index + 1)
    //   val info = line.substring(0, index)
    //   (classname, info)
    // })
    // mapPartitionsWithIndex is used instead so each partition's contents can be printed
    val stusRDD: RDD[(String, String)] = stuRDD.mapPartitionsWithIndex {
      case (partitionId, iterator) =>
        val array = iterator.toArray
        println(s"$partitionId,${array.mkString("[", ",", "]")}")
        array.map(line => {
          val index = line.lastIndexOf(" ")
          val classname = line.substring(index + 1)
          val info = line.substring(0, index)
          (classname, info)
        }).iterator
    }
    // 3. combineByKey: collect the names under each faction
    stusRDD.combineByKey(createCombiner, mergeValue, mergeCombiners).foreach(println)
  }
  def createCombiner(str: String): ArrayBuffer[String] = {
    // Called for the first value of a key seen in a partition: start a new buffer
    println("==================createCombiner<str = " + str + ">====================")
    val arrayBuffer = ArrayBuffer[String]()
    arrayBuffer.append(str)
    arrayBuffer
  }

  def mergeValue(arrayBuffer: ArrayBuffer[String], str: String): ArrayBuffer[String] = {
    // Local (per-partition) aggregation: append further values of the same key to its buffer
    println("==================mergeValue local aggregation<arrayBuffer = " + arrayBuffer + ", str = " + str + ">====================")
    arrayBuffer.append(str)
    arrayBuffer
  }

  def mergeCombiners(ab1: ArrayBuffer[String], ab2: ArrayBuffer[String]): ArrayBuffer[String] = {
    // Global aggregation: merge the per-partition buffers of the same key
    println("==================mergeCombiners global aggregation<ab1 = " + ab1 + ", ab2 = " + ab2 + ">====================")
    // ++= mutates ab1 in place; the original ab1.++(ab2) built a new buffer and discarded it
    ab1 ++= ab2
    ab1
  }
}
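For comparison, the same grouping can be produced directly with groupByKey. The sketch below (the method name gbk is illustrative; it could sit next to cbk2gbk in the same object) yields pairs such as (古墓派, ...) with all names of that faction grouped together, typically printed as a CompactBuffer rather than the ArrayBuffer built by the combineByKey version:

  def gbk(sc: SparkContext): Unit = {
    val stuList = List(
      "令狐冲 华山派", "岳不群 华山派", "虚竹 逍遥派", "乔峰 丐帮",
      "黄蓉 桃花岛", "杨过 古墓派", "小龙女 古墓派", "郭靖 丐帮"
    )
    val stusRDD: RDD[(String, String)] = sc.parallelize(stuList, 3).map(line => {
      val index = line.lastIndexOf(" ")
      (line.substring(index + 1), line.substring(0, index))
    })
    // groupByKey returns RDD[(String, Iterable[String])] -- the same grouping that the
    // combineByKey version assembles by hand with createCombiner/mergeValue/mergeCombiners
    stusRDD.groupByKey().foreach(println)
  }

Note that groupByKey disables map-side combining, while the combineByKey version above runs mergeValue inside each partition before the shuffle; for a plain grouping this does not shrink the shuffled data, so groupByKey is usually the simpler choice, and the combineByKey version is mainly useful for seeing when each of the three functions is invoked.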