一. Basic Operations
1. Add the dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
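If you build with sbt instead of Maven, the equivalent dependency line is:
libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0"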
2. Programming
2.1 Create the objects
// Create the context configuration object
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQL01_Demo")
// Create the SparkSession object
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
// Release resources when finished
spark.close()
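The separate SparkConf is optional: the builder can set the same options directly. A minimal equivalent sketch (spark2 is just a local name chosen here):
val spark2: SparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("SparkSQL01_Demo")
    .getOrCreate()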
2.2 Read a file
Create a user.json file under the datas directory with the following content:
{"username":"zhangsan","age":20}
{"username":"lisi","age":30}
{"username":"wangwu","age":40}
val df: DataFrame = spark.read.json("datas/user.json")
df.show()
Output:
+---+--------+
|age|username|
+---+--------+
| 20|zhangsan|
| 30| lisi|
| 40| wangwu|
+---+--------+
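To check the schema Spark inferred from the JSON, printSchema is handy; with this file, age is typically inferred as long:
df.printSchema()
// root
//  |-- age: long (nullable = true)
//  |-- username: string (nullable = true)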
2.3 SQL syntax
df.createOrReplaceTempView("user")
spark.sql("select * from user").show
spark.sql("select age, username from user").show
spark.sql("select avg(age) from user").show
Output:
+---+--------+
|age|username|
+---+--------+
| 20|zhangsan|
| 30| lisi|
| 40| wangwu|
+---+--------+
+---+--------+
|age|username|
+---+--------+
| 20|zhangsan|
| 30| lisi|
| 40| wangwu|
+---+--------+
+--------+
|avg(age)|
+--------+
| 30.0|
+--------+
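A view created with createOrReplaceTempView is scoped to the current SparkSession. For a view visible across sessions, Spark provides global temporary views, which live in the reserved global_temp database; a minimal sketch:
// Global temp views must be qualified with the global_temp database
df.createGlobalTempView("user")
spark.sql("select avg(age) from global_temp.user").show
spark.newSession().sql("select avg(age) from global_temp.user").show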
2.4 DSL syntax
When developing in IDEA, converting between RDDs, DataFrames, and Datasets requires import spark.implicits._
Here spark is not a Scala package name but the variable name of the SparkSession object you created, so the SparkSession must be created before the import. The spark object must not be declared with var, because Scala only allows imports from val-bound (stable) references.
import spark.implicits._
val df: DataFrame = spark.read.json("datas/user.json")
df.select("age", "username").show
df.select($"age" + 1).show
df.select('age + 1).show
Output:
+---+--------+
|age|username|
+---+--------+
| 20|zhangsan|
| 30| lisi|
| 40| wangwu|
+---+--------+
+---------+
|(age + 1)|
+---------+
| 21|
| 31|
| 41|
+---------+
+---------+
|(age + 1)|
+---------+
| 21|
| 31|
| 41|
+---------+
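Beyond select, the DSL offers the usual relational operators; a short sketch on the same DataFrame:
// Filter rows and aggregate without writing SQL
df.filter($"age" > 25).show
df.groupBy("age").count().show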
2.5 Dataset
// toDS on a Scala collection requires import spark.implicits._
val seq = Seq(1, 2, 3, 4)
val ds: Dataset[Int] = seq.toDS()
ds.show()
Output:
+-----+
|value|
+-----+
| 1|
| 2|
| 3|
| 4|
+-----+
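A Dataset can also be built from case-class instances, with the fields becoming columns; a short sketch reusing the User case class defined in section 2.6 below:
val userDS: Dataset[User] = Seq(User(1, "zhangsan", 30), User(2, "lisi", 40)).toDS()
userDS.show()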
2.6 Converting between RDD, DataFrame, and Dataset
case class User(id: Int, name: String, age: Int)

// RDD <=> DataFrame
val rdd = spark.sparkContext.makeRDD(List((1, "zhangsan", 30), (2, "lisi", 40)))
val df: DataFrame = rdd.toDF("id", "name", "age")
val rowRDD: RDD[Row] = df.rdd

// DataFrame <=> Dataset
val ds: Dataset[User] = df.as[User]
val df1: DataFrame = ds.toDF()

// RDD <=> Dataset
val ds1: Dataset[User] = rdd.map {
    case (id, name, age) => User(id, name, age)
}.toDS()
val userRDD: RDD[User] = ds1.rdd
Full code:
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Sql_1 {
    def main(args: Array[String]): Unit = {
        val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SQL")
            .set("spark.testing.memory", "2147480000")
        val spark = SparkSession.builder().config(sparkConf).getOrCreate()

        // val df: DataFrame = spark.read.json("datas/user.json")
        // df.show()
        // df.createOrReplaceTempView("user")
        // spark.sql("select * from user").show
        // spark.sql("select age, username from user").show
        // spark.sql("select avg(age) from user").show

        import spark.implicits._
        // df.select("age", "username").show
        // df.select($"age" + 1).show
        // df.select('age + 1).show

        // TODO Dataset
        // A DataFrame is really a Dataset of a specific type: Dataset[Row]
        // val seq = Seq(1,2,3,4)
        // val ds: Dataset[Int] = seq.toDS()
        // ds.show()

        // RDD <=> DataFrame
        val rdd = spark.sparkContext.makeRDD(List((1, "zhangsan", 30), (2, "lisi", 40)))
        val df: DataFrame = rdd.toDF("id", "name", "age")
        val rowRDD: RDD[Row] = df.rdd

        // DataFrame <=> Dataset
        val ds: Dataset[User] = df.as[User]
        val df1: DataFrame = ds.toDF()

        // RDD <=> Dataset
        val ds1: Dataset[User] = rdd.map {
            case (id, name, age) => User(id, name, age)
        }.toDS()
        val userRDD: RDD[User] = ds1.rdd

        spark.close()
    }

    case class User(id: Int, name: String, age: Int)
}
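When the schema is only known at runtime, a DataFrame can also be built from an RDD[Row] plus an explicit StructType via createDataFrame; a minimal sketch:
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val rowRdd = spark.sparkContext.makeRDD(List(Row(1, "zhangsan", 30), Row(2, "lisi", 40)))
val schema = StructType(Array(
    StructField("id", IntegerType),
    StructField("name", StringType),
    StructField("age", IntegerType)
))
val df2 = spark.createDataFrame(rowRdd, schema)
df2.show()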
二. User-Defined Functions
Spark SQL currently supports only UDFs and UDAFs:
- UDF: a one-to-one mapping over input values (the most commonly used);
- UDAF: an aggregate function, usually combined with GROUP BY; a many-to-one mapping.
1. UDF (user-defined function)
spark.udf.register(
    "xxx",                      // function name
    (param01: Type, ...) => {   // anonymous function
        .......
    }
)
Example: prepend "Name: " to each username:
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
val spark = SparkSession.builder().config(sparkConf).getOrCreate()
import spark.implicits._
val df = spark.read.json("datas/user.json")
// Create a temporary view
df.createOrReplaceTempView("user")
// Register the UDF
spark.udf.register("prefixName", (name: String) => {
    "Name: " + name
})
// Apply the UDF
spark.sql("select age, prefixName(username) from user").show
Output:
+---+------------------------+
|age|UDF:prefixName(username)|
+---+------------------------+
| 20| Name: zhangsan|
| 30| Name: lisi|
| 40| Name: wangwu|
+---+------------------------+
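A UDF can also be used in DSL style without registering it by name, via org.apache.spark.sql.functions.udf; prefixNameCol below is a local name chosen for this sketch:
import org.apache.spark.sql.functions.udf

val prefixNameCol = udf((name: String) => "Name: " + name)
df.select($"age", prefixNameCol($"username")).show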
2. UDAF (user-defined aggregate function)
Both the strongly typed Dataset and the weakly typed DataFrame provide built-in aggregate functions such as count(), countDistinct(), avg(), max(), and min(). Beyond these, users can define their own aggregate functions. A weakly typed UDAF is implemented by extending UserDefinedAggregateFunction; since Spark 3.0 that class is deprecated, and the strongly typed Aggregator is the recommended, unified approach.
With the strongly typed API, each record the aggregator handles is an instance of a case class (for example, a user bean) rather than the generic Row used by the weakly typed API.
Example: compute the average age
(1) Implementation: weakly typed UDAF
Extend UserDefinedAggregateFunction and implement the required methods.
/*
 Custom aggregate function class: compute the average age
 1. Extend UserDefinedAggregateFunction
 2. Override its 8 methods
*/
class MyAvgUDAF extends UserDefinedAggregateFunction {
    // Structure of the input data: IN
    override def inputSchema: StructType = {
        // StructType is a case class, so it can be used without `new`
        StructType(
            Array(
                StructField("age", LongType)
            )
        )
    }
    // Structure of the buffer data: BUFFER
    override def bufferSchema: StructType = {
        StructType(
            Array(
                StructField("total", LongType),
                StructField("count", LongType)
            )
        )
    }
    // Type of the result: OUT
    override def dataType: DataType = LongType
    // Whether the function is deterministic (same input, same output)
    override def deterministic: Boolean = true
    // Initialize the buffer
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
        // The first argument is the buffer field index: 0 is total, 1 is count;
        // 0L is a Long zero value
        buffer.update(0, 0L)
        buffer.update(1, 0L)
    }
    // Update the buffer with an input value
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
        // Add the age to total
        buffer.update(0, buffer.getLong(0) + input.getLong(0))
        // Increment count
        buffer.update(1, buffer.getLong(1) + 1)
    }
    // Merge two buffers
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
        buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0))
        buffer1.update(1, buffer1.getLong(1) + buffer2.getLong(1))
    }
    // Compute the final average
    override def evaluate(buffer: Row): Any = {
        buffer.getLong(0) / buffer.getLong(1)
    }
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}

def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    val df = spark.read.json("datas/user.json")
    df.createOrReplaceTempView("user")
    // Register the UDAF under the SQL name "ageAvg"
    spark.udf.register("ageAvg", new MyAvgUDAF())
    spark.sql("select ageAvg(age) from user").show
    spark.close()
}
Output:
+--------------+
|myavgudaf(age)|
+--------------+
| 30|
+--------------+
(2) Implementation: strongly typed UDAF
Extend Aggregator and implement the required methods.
/*
 Custom aggregate function class: compute the average age
 1. Extend org.apache.spark.sql.expressions.Aggregator with type parameters:
    IN  : input data type, Long
    BUF : buffer data type, Buff
    OUT : output data type, Long
 2. Override its 6 methods
*/
case class Buff(var total: Long, var count: Long)

class MyAvgUDAF extends Aggregator[Long, Buff, Long] {
    // z & zero: the initial (zero) value
    // Initialize the buffer
    override def zero: Buff = {
        Buff(0L, 0L)
    }
    // Update the buffer with an input value
    override def reduce(buff: Buff, in: Long): Buff = {
        buff.total = buff.total + in
        buff.count = buff.count + 1
        buff
    }
    // Merge two buffers
    override def merge(buff1: Buff, buff2: Buff): Buff = {
        buff1.total = buff1.total + buff2.total
        buff1.count = buff1.count + buff2.count
        buff1
    }
    // Compute the final result
    override def finish(buff: Buff): Long = {
        buff.total / buff.count
    }
    // Encoder for the buffer type
    override def bufferEncoder: Encoder[Buff] = Encoders.product
    // Encoder for the output type
    override def outputEncoder: Encoder[Long] = Encoders.scalaLong
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, SparkSession, functions}

def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    val df = spark.read.json("datas/user.json")
    df.createOrReplaceTempView("user")
    // Since Spark 3.0, functions.udaf wraps an Aggregator for use in SQL
    spark.udf.register("ageAvg", functions.udaf(new MyAvgUDAF()))
    spark.sql("select ageAvg(age) from user").show
    spark.close()
}
Output:
+--------------+
|myavgudaf(age)|
+--------------+
| 30|
+--------------+