13、Spark_RDD算子——CombineByKey_GroupByKey转换

一、SparkUtils工具类

import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {

  /** Master URL used when the caller does not specify one. */
  val DEFAULT_MASTER = "local[*]"

  /**
   * Creates a SparkContext for `appName` using [[DEFAULT_MASTER]] as the master URL.
   *
   * @param appName application name passed to the Spark configuration
   * @return a newly constructed SparkContext
   */
  def getSparkContext(appName: String): SparkContext =
    getSparkContext(appName, DEFAULT_MASTER)

  /**
   * Creates a SparkContext for `appName` on the given master.
   *
   * @param appName application name passed to the Spark configuration
   * @param master  master URL passed to the Spark configuration
   * @return a newly constructed SparkContext
   */
  def getSparkContext(appName: String, master: String): SparkContext = {
    val conf = new SparkConf()
      .setAppName(appName)
      .setMaster(master)
    new SparkContext(conf)
  }

  /**
   * Stops the given SparkContext; a null reference is silently ignored.
   *
   * @param sc the context to stop, may be null
   */
  def close(sc: SparkContext): Unit =
    Option(sc).foreach(_.stop())
}

二、日志工具

import org.apache.log4j.{Level, Logger}

/**
 * Mix-in that lowers log noise: when a class or object extending this trait is
 * initialised, the listed loggers are set to WARN level.
 */
trait LoggerTrait {
  // Same loggers, same order and same level as setting each one individually.
  Seq("org.apache.spark", "org.apache.hadoop", "org.spark_project")
    .foreach(loggerName => Logger.getLogger(loggerName).setLevel(Level.WARN))
}

三、Spark算子CombineByKey_GroupByKey转换


import cn.qphone.spark.common.LoggerTrait.LoggerTrait
import cn.qphone.spark.common.Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable.ArrayBuffer

/**
 * Demo: emulating `groupByKey` with `combineByKey`.
 *
 * Each input line has the form "<name> <school>"; the job groups the names by
 * school and prints one (school, names) pair per key. The three combiner
 * callbacks print a trace line so it is visible when each of them is invoked.
 */
object Deom13_CombineByKey_GroupByKey extends LoggerTrait {

  /** Entry point: obtains a SparkContext, runs the demo and always releases the context. */
  def main(args: Array[String]): Unit = {
    // 1. Obtain the SparkContext.
    val sc = SparkUtils.getSparkContext("Deom13_CombineByKey_GroupByKey ")
    try {
      // 2. Run the demo job.
      cbk2gbk(sc)
    } finally {
      // 3. Release resources even when the job throws.
      SparkUtils.close(sc)
    }
  }

  /**
   * Builds a (school, name) pair RDD from a fixed in-memory list and groups it
   * by key with `combineByKey`, printing every resulting pair.
   *
   * @param sc the SparkContext used to parallelize the sample data
   */
  def cbk2gbk(sc: SparkContext): Unit = {
    // 1. Sample data: "<name> <school>".
    val stuList = List(
      "令狐冲 华山派",
      "岳不群 华山派",
      "虚竹 逍遥派",
      "乔峰 丐帮",
      "黄蓉 桃花岛",
      "杨过 古墓派",
      "小龙女 古墓派",
      "郭靖 丐帮"
    )
    // 2. Reshape the data into (school, name) pairs, spread over 3 partitions.
    val stuRDD: RDD[String] = sc.parallelize(stuList, 3)
    // Plain-map alternative that does not print the partition contents:
    //    val stusRDD: RDD[(String, String)] = stuRDD.map(line => {
    //      val index = line.lastIndexOf(" ")
    //      val classname = line.substring(index + 1)
    //      val info = line.substring(0, index)
    //      (classname, info)
    //    })
    val stusRDD: RDD[(String, String)] = stuRDD.mapPartitionsWithIndex {
      case (partitionId, iterator) =>
        // The partition is materialised so its contents can be printed before mapping.
        val array = iterator.toArray
        println(s"${partitionId},${array.mkString("[",",","]")}")
        array.map { line =>
          // Split on the last space: text after it is the key, text before it the value.
          val index = line.lastIndexOf(" ")
          val classname = line.substring(index + 1)
          val info = line.substring(0, index)
          (classname, info)
        }.iterator
    }

    // 3. combineByKey acting as groupByKey.
    stusRDD.combineByKey(createCombiner, mergeValue, mergeCombiners).foreach(println)
  }

  /**
   * Creates the initial combiner for a key from the first value passed for it.
   *
   * @param str the first value for the key
   * @return a new buffer containing only `str`
   */
  def createCombiner(str: String): ArrayBuffer[String] = {
    println(s"==================createCombiner<Arraybuffer = ${str}>====================")
    ArrayBuffer[String](str)
  }

  /**
   * Adds one more value to an existing combiner.
   *
   * @param arrayBuffer the combiner built so far (mutated in place)
   * @param str         the value to add
   * @return `arrayBuffer` with `str` appended
   */
  def mergeValue(arrayBuffer: ArrayBuffer[String], str: String): ArrayBuffer[String] = {
    // The trace shows the buffer as it was before the new value is appended.
    println(s"==================mergeValue 分区类的局部聚合<Arraybuffer = ${arrayBuffer},str = ${str}>====================")
    arrayBuffer += str
  }

  /**
   * Merges two combiners of the same key into one.
   *
   * @param ab1 the combiner that receives the merged data (mutated in place)
   * @param ab2 the combiner whose elements are appended to `ab1`
   * @return `ab1` containing its own elements followed by those of `ab2`
   */
  def mergeCombiners(ab1: ArrayBuffer[String], ab2: ArrayBuffer[String]): ArrayBuffer[String] = {
    println(s"==================mergeCombiners 全局聚合<<全局聚合变量 = ${ab1},局部聚合之后数据 = ${ab2}>====================")
    // `++=` appends in place; plain `++` would build a new buffer that is then
    // discarded, silently dropping every element of ab2.
    ab1 ++= ab2
  }
}

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 游动-白 设计师:上身试试 返回首页