SparkSQL

Developing SparkSQL in IDEA

Add the dependency

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
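
For a project built with sbt rather than Maven, the equivalent line (an assumption mirroring the Maven snippet above; %% resolves to spark-sql_2.12 when scalaVersion is a 2.12.x release) would be:

libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0"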

Code implementation

package com.atguigu.bigdata.spark.sql

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Spark01_SparkSQL_Basic {
  def main(args: Array[String]): Unit = {
    // Create the runtime environment configuration and the SparkSession
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQL")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    // Needed for implicit conversions such as toDF/toDS and the $"col" syntax
    import spark.implicits._

    // DataFrame: read JSON and query it with SQL
//    val df: DataFrame = spark.read.json("E:/IDEA 2021/works/atguigu-classes/datas/user.json")
//    df.show()
//    df.createOrReplaceTempView("user")
//    spark.sql("select * from user").show()
//    spark.sql("select age, username from user").show()
//    spark.sql("select avg(age) from user").show()

    // DataFrame: query with the DSL instead of SQL
//    df.select("age", "username").show()
//    df.select($"age" + 1).show()
//    df.select('age + 1).show()

    // Dataset: build one directly from a Scala collection
//    val seq = Seq(1, 2, 3, 4)
//    val ds: Dataset[Int] = seq.toDS()
//    ds.show()

    // RDD <=> DataFrame
    val rdd = spark.sparkContext.makeRDD(List((1, "zhangsan", 30), (2, "lisi", 40)))
    val df: DataFrame = rdd.toDF("id", "name", "age")
    val rowRDD: RDD[Row] = df.rdd

    // DataFrame <=> Dataset
    val ds: Dataset[User] = df.as[User]
    val df1: DataFrame = ds.toDF()

    // RDD <=> Dataset
    val ds1: Dataset[User] = rdd.map {
      case (id, name, age) => User(id, name, age)
    }.toDS()
    val userRDD: RDD[User] = ds1.rdd

    spark.close()
  }

  case class User(id: Int, name: String, age: Int)
}
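
For reference, calling df.show() right after the toDF call above should print the two hard-coded rows like this:

+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|zhangsan| 30|
|  2|    lisi| 40|
+---+--------+---+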

UDAF

Both the strongly typed Dataset and the weakly typed DataFrame provide built-in aggregate functions such as count(), countDistinct(), avg(), max(), and min(). Beyond these, users can define their own aggregate functions. A weakly typed user-defined aggregate function is implemented by extending UserDefinedAggregateFunction; however, UserDefinedAggregateFunction is deprecated as of Spark 3.0, and the strongly typed Aggregator should be used instead.
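
As a quick illustration before writing any custom code, the built-in functions can be called directly through the DSL. A minimal sketch, assuming the same user.json file (with username and age fields) used throughout this article:

import org.apache.spark.sql.functions._

val df = spark.read.json("datas/user.json")
// built-in aggregations; no user-defined function required
df.select(avg("age"), max("age"), countDistinct("username")).show()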

Custom aggregate function classes:

Extending the UserDefinedAggregateFunction class (deprecated)

package com.atguigu.bigdata.spark.sql

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}

object Spark03_SparkSQL_UDAF {
  def main(args: Array[String]): Unit = {
    // Create the runtime environment configuration and the SparkSession
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQL")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    val df = spark.read.json("datas/user.json")
    df.createOrReplaceTempView("user")
    // Register the UDAF so it can be called from SQL
    spark.udf.register("ageAvg", new MyAvgUDAF())
    spark.sql("select ageAvg(age) from user").show()
    spark.close()
  }

  class MyAvgUDAF extends UserDefinedAggregateFunction {

    // Schema of the input: a single Long column (the age)
    override def inputSchema: StructType = {
      StructType(
        Array(
          StructField("age", LongType)
        )
      )
    }

    // Schema of the aggregation buffer: a running total and a count
    override def bufferSchema: StructType = {
      StructType(
        Array(
          StructField("total", LongType),
          StructField("count", LongType)
        )
      )
    }

    // Type of the final result
    override def dataType: DataType = LongType

    // The same input always produces the same output
    override def deterministic: Boolean = true

    // Initialize the buffer; buffer.update(i, v) is equivalent to buffer(i) = v
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer.update(0, 0L)
      buffer.update(1, 0L)
    }

    // Fold one input row into the buffer
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer.update(0, buffer.getLong(0) + input.getLong(0))
      buffer.update(1, buffer.getLong(1) + 1)
    }

    // Merge two partial buffers (e.g., from different partitions)
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0))
      buffer1.update(1, buffer1.getLong(1) + buffer2.getLong(1))
    }

    // Compute the final result (integer division, since dataType is LongType)
    override def evaluate(buffer: Row): Any = {
      buffer.getLong(0) / buffer.getLong(1)
    }
  }

}
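
Note that the buffer in this API is addressed by position (buffer.getLong(0), buffer.getLong(1)); nothing at compile time guarantees that index 0 really holds the total and index 1 the count. That is what "weakly typed" means here, and it is a key reason this API was deprecated in favor of the strongly typed Aggregator shown next.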

Extending Aggregator

package com.atguigu.bigdata.spark.sql

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SparkSession, functions}
import org.apache.spark.sql.expressions.Aggregator

object Spark03_SparkSQL_UDAF1 {
  def main(args: Array[String]): Unit = {
    // Create the runtime environment configuration and the SparkSession
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQL")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()

    val df = spark.read.json("datas/user.json")
    df.createOrReplaceTempView("user")
    // functions.udaf wraps the strongly typed Aggregator so it can be registered for SQL
    spark.udf.register("ageAvg", functions.udaf(new MyAvgUDAF()))
    spark.sql("select ageAvg(age) from user").show()
    spark.close()
  }

  // Aggregation buffer: a running total and a count (var fields so they can be updated in place)
  case class Buff(var total: Long, var count: Long)

  // Aggregator[IN, BUF, OUT]: input type, buffer type, output type
  class MyAvgUDAF extends Aggregator[Long, Buff, Long] {
    // Initial (zero) value of the buffer
    override def zero: Buff = {
      Buff(0L, 0L)
    }

    // Fold one input value into the buffer
    override def reduce(buff: Buff, in: Long): Buff = {
      buff.total = buff.total + in
      buff.count = buff.count + 1
      buff
    }

    // Merge two partial buffers (e.g., from different partitions)
    override def merge(buff1: Buff, buff2: Buff): Buff = {
      buff1.total = buff1.total + buff2.total
      buff1.count = buff1.count + buff2.count
      buff1
    }

    // Compute the final result from the merged buffer
    override def finish(reduction: Buff): Long = {
      reduction.total / reduction.count
    }

    // Encoders for the buffer and the output; Encoders.product works for any case class
    override def bufferEncoder: Encoder[Buff] = Encoders.product

    override def outputEncoder: Encoder[Long] = Encoders.scalaLong
  }

}
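
Beyond SQL registration, an Aggregator can also be used directly in the typed Dataset DSL via toColumn. A minimal sketch: the input type becomes the whole row, so it assumes a hypothetical User case class whose fields (username, age) match user.json:

import org.apache.spark.sql.{Dataset, Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator

case class User(username: String, age: Long)
case class Buff(var total: Long, var count: Long)

// Same averaging logic, but typed over whole User rows instead of a single Long column
class MyAvgAgeUDAF extends Aggregator[User, Buff, Long] {
  override def zero: Buff = Buff(0L, 0L)
  override def reduce(buff: Buff, user: User): Buff = {
    buff.total += user.age // read the field directly from the typed row
    buff.count += 1
    buff
  }
  override def merge(b1: Buff, b2: Buff): Buff = {
    b1.total += b2.total
    b1.count += b2.count
    b1
  }
  override def finish(buff: Buff): Long = buff.total / buff.count
  override def bufferEncoder: Encoder[Buff] = Encoders.product
  override def outputEncoder: Encoder[Long] = Encoders.scalaLong
}

// Usage, given the SparkSession and DataFrame from the example above:
// import spark.implicits._
// val ds: Dataset[User] = df.as[User]
// ds.select(new MyAvgAgeUDAF().toColumn.name("avgAge")).show()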
