package com.uplooking.bigdata.sql.p3

import com.uplooking.bigdata.utils.MySparkUtil
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Column, SQLContext}

/**
 * SparkSQL function operations: common aggregate functions, the
 * row_number() window function (per-group ranking / top-N), and
 * user-defined functions (UDFs).
 */
object ScalaSparkSQLFunctionOps {

  def main(args: Array[String]): Unit = {
    val sc = MySparkUtil.getSparkContext("ScalaSparkSQLFunction", "local")
    val sqlContext = MySparkUtil.getSparkSQLContext(sc)
    // commonFunctionOps(sqlContext)
    // rownumberFunctionOps(sqlContext)
    // rownumberFunctionOps1(sqlContext)
    udfOps(sqlContext)
    sc.stop()
  }

  /**
   * Writing a built-in SparkSQL UDF:
   *   1. write an ordinary Scala function
   *   2. register it through sqlContext.udf
   *   3. call it by name directly inside a SQL statement
   */
  def udfOps(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    jsonDF.registerTempTable("account")

    // step 1: define the custom function
    def myLen(str: String): Int = str.length

    // step 2: register the function.
    // register requires explicit type parameters: the FIRST one is the
    // return type, the remaining ones are the input parameter types.
    sqlContext.udf.register[Int, String]("myLen", str => /*myLen(str)*/ str.length)

    // step 3: use it in SQL
    val retDF = sqlContext.sql("select firstname, myLen(firstname) from account where balance > 30000")
    retDF.show()
  }

  /**
   * Per-group top-N: wrap the row_number() window query in a subquery
   * and filter on the computed rank (HAVING on the alias does not work,
   * hence the commented-out variant).
   */
  def rownumberFunctionOps1(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    jsonDF.registerTempTable("account")
    // columns of interest: state, firstname, balance
    // val rnDF = sqlContext.sql("select state, firstname, balance, row_number() over(partition by state order by balance desc) as rank from account having rank < 4")
    val rnDF = sqlContext.sql("select t.* from (select state, firstname, balance, row_number() over(partition by state order by balance desc) as rank from account) t where t.rank < 4")
    rnDF.show()
  }

  /**
   * Window function row_number(): rank rows within each group, the
   * building block for per-group sorting and per-group top-N.
   *
   * Ranks balances within each state; the next step (see
   * rownumberFunctionOps1) keeps only the top 3 per state.
   *
   * NOTE(review): window functions in this Spark generation are
   * Hive-specific, so a plain SQLContext is not enough — a HiveContext
   * is required. Confirm that MySparkUtil.getSparkSQLContext actually
   * returns a HiveContext.
   */
  def rownumberFunctionOps(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    jsonDF.registerTempTable("account")
    // columns of interest: state, firstname, balance
    val rnDF = sqlContext.sql("select state, firstname, balance, row_number() over(partition by state order by balance desc) as rank from account")
    rnDF.show()
  }

  /**
   * Common aggregate operations over the account data:
   *   - total number of people (count)
   *   - average and total balance
   *   - number of people per state (group by)
   */
  def commonFunctionOps(sqlContext: SQLContext): Unit = {
    val jsonDF = sqlContext.read.json("E:/test/spark/sql/account.json")
    // schema: account_number|address|age|balance|city|email|employer|firstname|gender|lastname|state
    jsonDF.show()

    val countNum = jsonDF.count()
    println("当前DF中的数据条数:" + countNum)

    // register jsonDF as a temporary table so it can be queried with SQL
    jsonDF.registerTempTable("account")
    val asDF = sqlContext.sql("select avg(balance) as avg_balance, sum(balance) as sum_balance from account")
    asDF.show()

    // number of people per state
    val psDF = sqlContext.sql("select state, count(1) as state_person_num from account group by state")
    psDF.show()
  }
}
// SparkSQL function operations (original article title: "SparkSQL之函数的操作")
// Scraped blog metadata: latest recommended article published 2024-07-29 14:05:37