package com.uplooking.bigdata.sql.p3

import com.uplooking.bigdata.utils.MySparkUtil
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Column, SQLContext}

/**
 * SparkSQL function operations: common aggregate functions, the
 * row_number() window function (per-group ranking / top-N), and
 * user-defined functions (UDFs).
 */
object ScalaSparkSQLFunctionOps {

  def main(args: Array[String]): Unit = {
    val sc = MySparkUtil.getSparkContext("ScalaSparkSQLFunction", "local")
    val sqlContext = MySparkUtil.getSparkSQLContext(sc)
    // commonFunctionOps(sqlContext)
    // rownumberFunctionOps(sqlContext)
    // rownumberFunctionOps1(sqlContext)
    udfOps(sqlContext)
    sc.stop()
  }

  /**
   * Writing a built-in SparkSQL UDF:
   *   1. write an ordinary Scala function
   *   2. register it through sqlContext.udf
   *   3. call it by name directly inside a SQL statement
   */
  def udfOps(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    jsonDF.registerTempTable("account")

    // step 1: define the custom function
    def myLen(str: String): Int = str.length

    // step 2: register the function.
    // register requires explicit type parameters: the FIRST one is the
    // return type, the remaining ones are the input parameter types.
    sqlContext.udf.register[Int, String]("myLen", str => /*myLen(str)*/ str.length)

    // step 3: use it in SQL
    val retDF = sqlContext.sql("select firstname, myLen(firstname) from account where balance > 30000")
    retDF.show()
  }

  /**
   * Per-group top-N: wrap the row_number() window query in a subquery
   * and filter on the computed rank (HAVING on the alias does not work,
   * hence the commented-out variant).
   */
  def rownumberFunctionOps1(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    jsonDF.registerTempTable("account")
    // columns of interest: state, firstname, balance
    // val rnDF = sqlContext.sql("select state, firstname, balance, row_number() over(partition by state order by balance desc) as rank from account having rank < 4")
    val rnDF = sqlContext.sql("select t.* from (select state, firstname, balance, row_number() over(partition by state order by balance desc) as rank from account) t where t.rank < 4")
    rnDF.show()
  }

  /**
   * Window function row_number(): rank rows within each group, the
   * building block for per-group sorting and per-group top-N.
   *
   * Ranks balances within each state; the next step (see
   * rownumberFunctionOps1) keeps only the top 3 per state.
   *
   * NOTE(review): window functions in this Spark generation are
   * Hive-specific, so a plain SQLContext is not enough — a HiveContext
   * is required. Confirm that MySparkUtil.getSparkSQLContext actually
   * returns a HiveContext.
   */
  def rownumberFunctionOps(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    jsonDF.registerTempTable("account")
    // columns of interest: state, firstname, balance
    val rnDF = sqlContext.sql("select state, firstname, balance, row_number() over(partition by state order by balance desc) as rank from account")
    rnDF.show()
  }

  /**
   * Common aggregate operations over the account data:
   *   - total number of people (count)
   *   - average and total balance
   *   - number of people per state (group by)
   */
  def commonFunctionOps(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    // schema: account_number|address|age|balance|city|email|employer|firstname|gender|lastname|state
    jsonDF.show()

    val countNum = jsonDF.count()
    println("当前DF中的数据条数:" + countNum)

    // register jsonDF as a temporary table so it can be queried with SQL
    jsonDF.registerTempTable("account")
    val asDF = sqlContext.sql("select avg(balance) as avg_balance, sum(balance) as sum_balance from account")
    asDF.show()

    // number of people per state
    val psDF = sqlContext.sql("select state, count(1) as state_person_num from account group by state")
    psDF.show()
  }
}
// SparkSQL function operations (original article title: "SparkSQL之函数的操作")
// Scraped blog metadata: latest recommended article published 2024-07-29 14:05:37