df.createGlobalTempView() // creates a global temporary view from the DF; the resulting table can be shared by multiple Spark sessions, and its lifecycle is bound to the Spark application
df.createTempView() // creates a session-scoped temporary view from the DF; the resulting table is visible only to the Spark session that created it and cannot be accessed from other sessions
Basic query
package com.baizhi.sql

import org.apache.spark.sql.SparkSession

object DataFrameSqlOpt {
  def main(args: Array[String]): Unit = {
    // 1. SparkSession is the entry point of a Spark SQL application; it wraps SparkConf and SparkContext internally
    val spark = SparkSession
      .builder()
      .appName("the first spark sql example")
      .master("local[*]")
      .getOrCreate()

    // 2. Create a DataFrame
    val rdd = spark.sparkContext.makeRDD(List("Hello Hadoop", "Hello Scala")).flatMap(_.split(" ")).map((_, 1))
    import spark.implicits._
    val df = rdd.toDF("word", "num")

    // Register the DF under a table name. A global view must be qualified with the database name global_temp when queried
    // df.createGlobalTempView("t_user") // global temporary view: shared by multiple Spark sessions, lifecycle bound to the Spark application
    df.createTempView("t_user") // session-scoped temporary view: visible only to the Spark session that created it

    // Create another session -- can it query the global temp view? Yes
    // val newSparkSession = spark.newSession()
    // spark.sql("select * from global_temp.t_user").show()
    // newSparkSession.sql("select * from global_temp.t_user").show()

    // Create another session -- can it query the session-scoped temp view? No: the last query below fails with an AnalysisException (table or view not found)
    val newSparkSession = spark.newSession()
    spark.sql("select * from t_user").show()
    newSparkSession.sql("select * from t_user").show()

    spark.stop()
  }
}
Fuzzy query (LIKE)
import spark.implicits._
val userDF = List((1, "zs", true, 18, 15000, 1), (2, "ls", false, 19, 15000, 1)).toDF("id", "name", "sex", "age", "salary", "dept")
userDF.createTempView("t_user")
spark.sql("select * from t_user where name like '%z%' and age > 18").show()
Sort query
// sort query
spark.sql(
  // the triple-quoted string is passed through as one multi-line SQL statement
  """
    select *
    from t_user
    order by id desc
  """).show()
Group-by query
spark.sql(
  """
    select sex, avg(salary) as avg_salary
    from t_user
    group by sex
  """).show()
//---------------------------------------------------------------------------
+-----+----------+
| sex|avg_salary|
+-----+----------+
| true| 15000.0|
|false| 15000.0|
+-----+----------+
Limit
Limits the number of rows returned.
// group-by query with limit: average salary per sex, keeping only one result row
spark.sql(
  """
    select sex, avg(salary) as avg_salary
    from t_user
    group by sex
    limit 1
  """).show()
//---------------------------------------------------------------------------
+----+----------+
| sex|avg_salary|
+----+----------+
|true| 15000.0|
+----+----------+
Having (filter after grouping)
spark.sql(
  """
    select sex, avg(salary) as avg_salary
    from t_user
    group by sex
    having sex = true
  """).show()
//---------------------------------------------------------------------------
+----+----------+
| sex|avg_salary|
+----+----------+
|true| 15000.0|
+----+----------+
Case … when statement
spark.sql(
  """
    | select
    |   id, name, salary, age,
    |   case sex
    |     when true then '男'
    |     when false then '女'
    |     else '中性'
    |   end as newSex
    | from t_user
  """.stripMargin).show()
//----------------------------------------------------------------------------
+---+----+------+---+------+
| id|name|salary|age|newSex|
+---+----+------+---+------+
| 1| zs| 15000| 18| 男|
| 2| ls| 15000| 19| 女|
+---+----+------+---+------+
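The same mapping can be written with the DataFrame API instead of SQL; here is a minimal sketch using functions.when/otherwise (assumes the t_user view and import spark.implicits._ from the snippets above):
import org.apache.spark.sql.functions.when
spark.table("t_user")
  .withColumn("newSex", when($"sex" === true, "男").when($"sex" === false, "女").otherwise("中性"))
  .show()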
Pivot (rows to columns)
// pivot
val scoreDF=List(
(1, "语文", 100),
(1, "数学", 100),
(1, "英语", 100),
(2, "数学", 79),
(2, "语文", 80),
(2, "英语", 100),
(2, "英语", 120))
.toDF("id","course","score")
scoreDF.createOrReplaceTempView("t_course")
// Note: columns not referenced in the pivot clause implicitly become the group-by key
spark.sql(
  """
    | select *
    | from t_course
    | pivot(max(score) for course in ('语文', '数学', '英语'))
  """.stripMargin).show()
//----------------------------------------------------------------------------
+---+----+----+----+
| id|语文|数学|英语|
+---+----+----+----+
| 1| 100| 100| 100|
| 2| 80| 79| 120|
+---+----+----+----+
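The same pivot can be expressed with the DataFrame API; a minimal sketch reusing scoreDF from above, with the value list given explicitly so Spark can skip the extra pass that collects the distinct courses:
import org.apache.spark.sql.functions.max
scoreDF.groupBy("id")
  .pivot("course", Seq("语文", "数学", "英语"))
  .agg(max("score"))
  .show()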
Cube (multi-dimensional grouping)
// cube(A, B) groups by every combination of the dimensions:
// (A, B), (A, null), (null, B), and the grand total (null, null)
val df2 = List(
(110, 50, 80, 80),
(120, 60, 95, 75),
(120, 50, 96, 70))
.toDF("height", "weight", "uiq", "ueq")
df2.createTempView("tt_user")
spark.sql(
  """
    | select height, uiq, avg(uiq)
    | from tt_user
    | group by cube(height, uiq)
  """.stripMargin).show()
//-----------------------------------------------------------------
+------+----+-----------------+
|height| uiq| avg(uiq)|
+------+----+-----------------+
| 120|null| 95.5|
| null| 80| 80.0|
| null|null|90.33333333333333|
| null| 95| 95.0|
| 120| 95| 95.0|
| 110|null| 80.0|
| 110| 80| 80.0|
| 120| 96| 96.0|
| null| 96| 96.0|
+------+----+-----------------+
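The DataFrame API exposes the same operation as Dataset.cube; a minimal sketch over df2:
import org.apache.spark.sql.functions.avg
df2.cube("height", "uiq")
  .agg(avg("uiq"))
  .show()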
Join query
// join
val userInfoDF = spark.sparkContext.makeRDD(List((1, "zs"), (2, "ls"), (3, "ww"))).toDF("id", "name")
val orderInfoDF = spark.sparkContext.makeRDD(List((1, "iphone", 1000, 1), (2, "mi9", 999, 1), (3, "连衣裙", 99, 2))).toDF("oid", "product", "price", "uid")
userInfoDF.createTempView("ttt_user")
orderInfoDF.createTempView("t_order")
// join types: inner, left_outer, right_outer, full, cross
spark.sql(
  """
    | select *
    | from ttt_user t1
    | inner join t_order t2
    | on t1.id = t2.uid
  """.stripMargin).show()
Subquery
Works like a subquery in standard SQL.
// subquery
val df =
List(
(1, "zs", true, 1, 15000),
(2, "ls", false, 2, 18000),
(3, "ww", false, 2, 14000),
(4, "zl", false, 1, 18000),
(5, "win7", false, 1, 16000)
).toDF("id", "name", "sex", "dept", "salary")
df.createTempView("t_employee")
spark.sql(
  """
    select id, name, sex, dept
    from (select * from t_employee)
  """).show()
//-----------------------------------------------------
+---+----+-----+----+
| id|name| sex|dept|
+---+----+-----+----+
| 1| zs| true| 1|
| 2| ls|false| 2|
| 3| ww|false| 2|
| 4| zl|false| 1|
| 5|win7|false| 1|
+---+----+-----+----+
Window functions
In ordinary analysis we usually rely on aggregate functions, whose defining trait is collapsing n rows into a single row. Databases also offer window (analytic) functions: a window function computes over a set of related rows but still returns a value for every row, so the row count is preserved. If each record returned by a query is one floor of a building, a window function opens a window on every floor, letting each floor see the whole building or a part of it.
Syntax:
window_function() over([partition by partition_col] order by col asc | desc [range | rows between unbounded preceding and unbounded following])
// create the DF (seven employees, matching the output shown below)
val df = List(
  (1, "zs", true, 1, 15000),
  (2, "ls", false, 2, 18000),
  (3, "ww", false, 2, 14000),
  (4, "zl", false, 1, 18000),
  (5, "win7", false, 1, 16000),
  (6, "wb", false, 1, 18000),
  (7, "wb2", false, 1, 20000)).toDF("id", "name", "sex", "dept", "salary")
df.createTempView("t_employee")
// window_function() over([partition by partition_col] order by col asc | desc [range | rows between unbounded preceding and unbounded following])
spark.sql(
  """
    | select
    |   id, name, sex, dept, salary,
    |   sum(id) over(partition by dept order by salary rows between unbounded preceding and unbounded following) as sum_id,
    |   sum(id) over(partition by dept order by salary) as sum_id2,
    |   sum(id) over() as sum_id3,
    |   sum(id) over(partition by dept order by salary rows between 1 preceding and 1 following) as sum_id4,
    |   sum(id) over(partition by dept order by salary range between 1000 preceding and 2000 following) as sum_id5,
    |   row_number() over(partition by dept order by salary) as rn,
    |   rank() over(partition by dept order by salary) as salary_rank,
    |   dense_rank() over(partition by dept order by salary asc) as salary_rank2,
    |   lag(salary, 2) over(partition by dept order by salary asc) as lag2
    | from t_employee
  """.stripMargin).show()
// sum_id:       rows between unbounded preceding and unbounded following -- every row sees the entire partition
// sum_id2:      no explicit frame; with an order by, the default frame is range between unbounded preceding and current row
// sum_id3:      over() with no clauses aggregates over the whole table for every row, so each row shows sum(id) = 28
// sum_id4:      frame of the previous row, the current row, and the next row: rows between 1 preceding and 1 following
// sum_id5:      range frame relative to the ordering value: [salary - 1000, salary + 2000]
// rn:           row_number() numbers the rows within the window, similar to Oracle's rownum pseudo-column
// salary_rank:  rank() ranks rows by the order-by column; non-dense ranking, so ties leave gaps
// salary_rank2: dense_rank() produces a dense, consecutive ranking with no gaps after ties
// lag2:         lag(salary, 2) returns the salary from two rows above the current row
spark.stop()
//--------------------------------------------------------------------------------
+---+----+-----+----+------+------+-------+-------+-------+-------+---+-----------+------------+-----+
| id|name| sex|dept|salary|sum_id|sum_id2|sum_id3|sum_id4|sum_id5| rn|salary_rank|salary_rank2| lag2|
+---+----+-----+----+------+------+-------+-------+-------+-------+---+-----------+------------+-----+
| 1| zs| true| 1| 15000| 23| 1| 28| 6| 6| 1| 1| 1| null|
| 5|win7|false| 1| 16000| 23| 6| 28| 10| 16| 2| 2| 2| null|
| 4| zl|false| 1| 18000| 23| 16| 28| 15| 17| 3| 3| 3|15000|
| 6| wb|false| 1| 18000| 23| 16| 28| 17| 17| 4| 3| 3|16000|
| 7| wb2|false| 1| 20000| 23| 23| 28| 13| 7| 5| 5| 4|18000|
| 3| ww|false| 2| 14000| 5| 3| 28| 5| 3| 1| 1| 1| null|
| 2| ls|false| 2| 18000| 5| 5| 28| 5| 2| 2| 2| 2| null|
+---+----+-----+----+------+------+-------+-------+-------+-------+---+-----------+------------+-----+
unbounded preceding: the start of the partition (conceptually Long.MinValue)
unbounded following: the end of the partition (conceptually Long.MaxValue)
current row: the current row
1 preceding: the row immediately before the current row
1 following: the row immediately after the current row
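These boundaries compose into window frames. For instance, a minimal sketch of a per-department running salary total against the t_employee view defined above:
spark.sql(
  """
    | select id, name, dept, salary,
    | sum(salary) over(partition by dept order by salary rows between unbounded preceding and current row) as running_total
    | from t_employee
  """.stripMargin).show()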
Custom functions in Spark SQL
Single-row functions
Apply the function to every row, e.g. upper(), lower(), length().
package com.baizhi

import org.apache.spark.sql.SparkSession

/**
 * Custom single-row function (UDF) example
 */
object CustomUserSingleFunction1 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("window function").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._

    // create the DF
    val df = List(
      (1, "zs", true, 1, 15000),
      (2, "ls", false, 2, 18000),
      (3, "ww", false, 2, 14000),
      (4, "zl", false, 1, 18000),
      (5, "win7", false, 1, 16000),
      (6, "wb", false, 1, 18000),
      (7, "wb2", false, 1, 20000)
    ).toDF("id", "name", "sex", "dept", "salary")
    df.createTempView("t_employee")

    // register a custom single-row function (a Boolean has only two values, so no wildcard case is needed)
    spark.udf.register("sex_converter", (sex: Boolean) => {
      sex match {
        case true => "男"
        case false => "女"
      }
    })

    spark.sql("select id,upper(name),sex_converter(sex) from t_employee").show()
    spark.stop()
  }
}
//----------------------------------------------------------------
+---+-----------+----------------------+
| id|upper(name)|UDF:sex_converter(sex)|
+---+-----------+----------------------+
| 1| ZS| 男|
| 2| LS| 女|
| 3| WW| 女|
| 4| ZL| 女|
| 5| WIN7| 女|
| 6| WB| 女|
| 7| WB2| 女|
+---+-----------+----------------------+
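A UDF can also be used directly in the DataFrame API without registering it under a SQL name; a minimal sketch using org.apache.spark.sql.functions.udf (assumes df and import spark.implicits._ from the example above):
import org.apache.spark.sql.functions.udf
val sexConverter = udf((sex: Boolean) => if (sex) "男" else "女")
df.select($"id", $"name", sexConverter($"sex").as("sex_label")).show()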
Multi-row functions
Apply the function across multiple rows and return a single result, e.g. the aggregate functions sum(), avg(), min()…
Goal: a custom multi-row function that sums integers.
package com.baizhi

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, IntegerType, StructType}

/**
 * Custom multi-row (aggregate) function example: an integer sum
 */
object CustomUserSingleFunction2 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("window function").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._

    // create the DF
    val df = List(
      (1, "zs", true, 1, 15000),
      (2, "ls", false, 2, 18000),
      (3, "ww", false, 2, 14000),
      (4, "zl", false, 1, 18000),
      (5, "win7", false, 1, 16000),
      (6, "wb", false, 1, 18000),
      (7, "wb2", false, 1, 20000)
    ).toDF("id", "name", "sex", "dept", "salary")
    df.createTempView("t_employee")
    // register a custom multi-row (aggregate) function
    spark.udf.register("my_sum", new UserDefinedAggregateFunction {

      /** Schema of the input arguments */
      override def inputSchema: StructType = new StructType().add("salary", IntegerType)

      /** Schema of the aggregation buffer, which holds the intermediate result */
      override def bufferSchema: StructType = new StructType().add("total", IntegerType)

      /** Return type of the final aggregated value */
      override def dataType: DataType = IntegerType

      /** Whether the function always returns the same result for the same input */
      override def deterministic: Boolean = true

      /** Initializes the buffer */
      override def initialize(buffer: MutableAggregationBuffer): Unit = {
        // slot 0 of the buffer starts out as 0
        buffer.update(0, 0)
      }

      /** Called once per input row to fold that row into the buffer */
      override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
        // read the first argument passed to the function
        val rowValue = input.getInt(0)
        val currentValue = buffer.getInt(0)
        buffer.update(0, rowValue + currentValue)
      }

      /** Merges two buffers (e.g. from different partitions), storing the result in the first */
      override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
        val b1CurrentValue = buffer1.getInt(0)
        val b2CurrentValue = buffer2.getInt(0)
        buffer1.update(0, b1CurrentValue + b2CurrentValue)
      }

      /** Produces the final aggregated result from the buffer */
      override def evaluate(buffer: Row): Any = buffer.getInt(0)
    })
spark.sql("select my_sum(salary) from t_employee").show()
spark.stop()
}
}
//------------------------------------------------------------------------------------------
+--------------+
|anon$1(salary)|
+--------------+
| 119000|
+--------------+
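Note that on Spark 3.x, UserDefinedAggregateFunction is deprecated in favor of the typed Aggregator combined with functions.udaf. A minimal sketch of the same integer sum in that style (the names MySumAgg and my_sum2 are illustrative):
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, functions}

// typed sum: input Int, buffer Int, output Int
object MySumAgg extends Aggregator[Int, Int, Int] {
  override def zero: Int = 0                                          // initial buffer value
  override def reduce(buffer: Int, value: Int): Int = buffer + value  // fold one input row into the buffer
  override def merge(b1: Int, b2: Int): Int = b1 + b2                 // combine buffers from different partitions
  override def finish(reduction: Int): Int = reduction                // final result
  override def bufferEncoder: Encoder[Int] = Encoders.scalaInt
  override def outputEncoder: Encoder[Int] = Encoders.scalaInt
}

spark.udf.register("my_sum2", functions.udaf(MySumAgg))
spark.sql("select my_sum2(salary) from t_employee").show()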