Spark SQL RowEncoder 源码分析
文章目录
1. 源码适用场景
Spark SQL RowEncoder 是一个编码器工厂,用于构造在外部行(Row)与 Spark SQL 内部二进制表示之间相互转换的编码器。它适用于以下场景:
- 将外部数据源的数据转换为 Spark SQL 内部数据格式。
- 在 Spark SQL 中执行查询时,将结果序列化为二进制格式以进行传输和存储。
- 将内部数据格式反序列化为外部数据格式,以便与外部系统进行交互。
2. 该源码方法总结归纳
RowEncoder 对象中包含了以下几个方法:
- apply(schema: StructType): ExpressionEncoder[Row]:根据给定的 StructType 构造一个 ExpressionEncoder,用于将 Row 对象转换为内部二进制表示。
- private def serializerFor(inputObject: Expression, inputType: DataType): Expression:根据输入类型生成对应的序列化表达式,将输入对象转换为内部二进制表示。
- def externalDataTypeForInput(dt: DataType): DataType:返回可用于生成将输入数据转换为 Spark SQL 内部格式的代码的 DataType。与 externalDataTypeFor 不同,此函数返回的 DataType 可能更宽松,因为多个外部类型可能映射到单个内部类型。
- def externalDataTypeFor(dt: DataType): DataType:返回给定 DataType 对应的外部数据类型(例如 TimestampType 对应 java.sql.Timestamp,DecimalType 对应 java.math.BigDecimal)。与 externalDataTypeForInput 不同,这里返回的是具体的外部类型,而不是更宽松的类型。
- private def deserializerFor(schema: StructType): Expression:根据给定的 StructType 生成一个表达式,用于将内部二进制表示反序列化为 Row 对象。
- private def deserializerFor(input: Expression, dataType: DataType): Expression:根据输入类型生成对应的反序列化表达式,将输入对象转换为对应的数据类型。
3. 主要用法及其代码示例
3.1 使用 RowEncoder 将 Row 转换为二进制表示
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.types._

val spark = SparkSession.builder().appName("RowEncoderExample").getOrCreate()

// Describe the layout of the rows to encode.
val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType),
  StructField("salary", DoubleType)
))

// An external Row whose values match the schema above.
val row = Row("Alice", 30, 5000.0)

// Build the encoder for this schema.
val encoder: ExpressionEncoder[Row] = RowEncoder(schema)

// Encode the Row into its binary form.
// toRow returns an InternalRow rather than a byte array, so casting the result to
// Array[Byte] would throw ClassCastException. The bytes are taken from the UnsafeRow instead.
val binaryData: Array[Byte] = encoder.toRow(row) match {
  case unsafeRow: UnsafeRow => unsafeRow.getBytes
  case other =>
    throw new IllegalStateException(
      s"Expected an UnsafeRow but got ${other.getClass.getName}")
}
3.2 使用 RowEncoder 将二进制表示反序列化为 Row
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.types._

val spark = SparkSession.builder().appName("RowEncoderExample").getOrCreate()

// Describe the layout of the rows to decode.
val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType),
  StructField("salary", DoubleType)
))

// Placeholder for the binary data: the bytes of an UnsafeRow that was encoded with this schema.
val binaryData: Array[Byte] = ???

// Build the encoder for this schema.
val encoder: ExpressionEncoder[Row] = RowEncoder(schema)

// fromRow expects an InternalRow, not a raw byte array, so wrap the bytes in an UnsafeRow
// that has one slot per schema field.
val unsafeRow = new UnsafeRow(schema.length)
unsafeRow.pointTo(binaryData, binaryData.length)

// Decode the binary data back into a Row.
// The encoder has to be resolved and bound to its schema before fromRow can be called.
val row: Row = encoder.resolveAndBind().fromRow(unsafeRow)
4. 官方链接
5. 源码
/**
 * A factory for constructing encoders that convert external row to/from the Spark SQL
 * internal binary representation.
 *
 * The following is a mapping between Spark SQL types and its allowed external types:
 * {{{
 *   BooleanType -> java.lang.Boolean
 *   ByteType -> java.lang.Byte
 *   ShortType -> java.lang.Short
 *   IntegerType -> java.lang.Integer
 *   LongType -> java.lang.Long
 *   FloatType -> java.lang.Float
 *   DoubleType -> java.lang.Double
 *   StringType -> String
 *   DecimalType -> java.math.BigDecimal or scala.math.BigDecimal or Decimal
 *
 *   DateType -> java.sql.Date
 *   TimestampType -> java.sql.Timestamp
 *
 *   BinaryType -> byte array
 *   ArrayType -> scala.collection.Seq or Array
 *   MapType -> scala.collection.Map
 *   StructType -> org.apache.spark.sql.Row
 * }}}
 */
object RowEncoder {

  /**
   * Builds an [[ExpressionEncoder]] for external [[Row]] objects described by `schema`.
   *
   * The input row is read from ordinal 0 and wrapped in `AssertNotNull` before the serializer
   * expression is built. The deserializer is built from the same schema.
   *
   * @param schema the struct type describing the fields of the rows to encode
   * @return an encoder whose serializer and deserializer are derived from `schema`
   */
  def apply(schema: StructType): ExpressionEncoder[Row] = {
    val cls = classOf[Row]
    val inputObject = BoundReference(0, ObjectType(cls), nullable = true)
    val serializer = serializerFor(AssertNotNull(inputObject, Seq("top level row object")), schema)
    val deserializer = deserializerFor(schema)
    new ExpressionEncoder[Row](
      schema,
      flat = false,
      // For a StructType the serializer below is built as a CreateNamedStruct.
      serializer.asInstanceOf[CreateNamedStruct].flatten,
      deserializer,
      ClassTag(cls))
  }

  /**
   * Resolves the class that implements the given user defined type: the class named by the
   * `SQLUserDefinedType` annotation on the user class if present, otherwise the class
   * registered in `UDTRegistration` under the user class name.
   *
   * @param udt the user defined type whose implementing class is looked up
   * @return the class to instantiate for calling `serialize` / `deserialize`
   * @throws SparkException if the user class is neither annotated nor registered
   */
  private def udtClassFor(udt: UserDefinedType[_]): Class[_] = {
    val annotation = udt.userClass.getAnnotation(classOf[SQLUserDefinedType])
    if (annotation != null) {
      annotation.udt()
    } else {
      UDTRegistration.getUDTFor(udt.userClass.getName).getOrElse {
        throw new SparkException(s"${udt.userClass.getName} is not annotated with " +
          "SQLUserDefinedType nor registered with UDTRegistration.")
      }
    }
  }

  /**
   * Builds the expression that converts `inputObject`, an external value of `inputType`,
   * into its Spark SQL internal form.
   *
   * @param inputObject expression producing the external value
   * @param inputType the Spark SQL type the value is serialized as
   * @return the serializing expression
   */
  private def serializerFor(
      inputObject: Expression,
      inputType: DataType): Expression = inputType match {
    // Native types are passed through unchanged.
    case dt if ScalaReflection.isNativeType(dt) => inputObject

    // Python UDTs are serialized as their underlying SQL type.
    case p: PythonUserDefinedType => serializerFor(inputObject, p.sqlType)

    case udt: UserDefinedType[_] =>
      val udtClass = udtClassFor(udt)
      val obj = NewInstance(
        udtClass,
        Nil,
        dataType = ObjectType(udtClass),
        propagateNull = false)
      Invoke(obj, "serialize", udt, inputObject :: Nil, returnNullable = false)

    case TimestampType =>
      StaticInvoke(
        DateTimeUtils.getClass,
        TimestampType,
        "fromJavaTimestamp",
        inputObject :: Nil,
        returnNullable = false)

    case DateType =>
      StaticInvoke(
        DateTimeUtils.getClass,
        DateType,
        "fromJavaDate",
        inputObject :: Nil,
        returnNullable = false)

    case d: DecimalType =>
      CheckOverflow(StaticInvoke(
        Decimal.getClass,
        d,
        "fromDecimal",
        inputObject :: Nil,
        returnNullable = false), d)

    case StringType =>
      StaticInvoke(
        classOf[UTF8String],
        StringType,
        "fromString",
        inputObject :: Nil,
        returnNullable = false)

    case t @ ArrayType(et, containsNull) =>
      et match {
        // Arrays of these element types are converted in one call.
        case BooleanType | ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType =>
          StaticInvoke(
            classOf[ArrayData],
            t,
            "toArrayData",
            inputObject :: Nil,
            returnNullable = false)

        // Any other element type is validated and serialized element by element.
        case _ => MapObjects(
          element => {
            val value = serializerFor(ValidateExternalType(element, et), et)
            if (!containsNull) {
              AssertNotNull(value, Seq.empty)
            } else {
              value
            }
          },
          inputObject,
          ObjectType(classOf[Object]))
      }

    case t @ MapType(kt, vt, valueNullable) =>
      // Keys and values are extracted as two sequences and serialized as arrays.
      val keys =
        Invoke(
          Invoke(inputObject, "keysIterator", ObjectType(classOf[scala.collection.Iterator[_]]),
            returnNullable = false),
          "toSeq",
          ObjectType(classOf[scala.collection.Seq[_]]), returnNullable = false)
      val convertedKeys = serializerFor(keys, ArrayType(kt, false))

      val values =
        Invoke(
          Invoke(inputObject, "valuesIterator", ObjectType(classOf[scala.collection.Iterator[_]]),
            returnNullable = false),
          "toSeq",
          ObjectType(classOf[scala.collection.Seq[_]]), returnNullable = false)
      val convertedValues = serializerFor(values, ArrayType(vt, valueNullable))

      val nonNullOutput = NewInstance(
        classOf[ArrayBasedMapData],
        convertedKeys :: convertedValues :: Nil,
        dataType = t,
        propagateNull = false)

      // Only guard against a null map when the input expression can actually be null.
      if (inputObject.nullable) {
        If(IsNull(inputObject),
          Literal.create(null, inputType),
          nonNullOutput)
      } else {
        nonNullOutput
      }

    case StructType(fields) =>
      // Emits alternating (name, value) expressions, one pair per field.
      val nonNullOutput = CreateNamedStruct(fields.zipWithIndex.flatMap { case (field, index) =>
        val fieldValue = serializerFor(
          ValidateExternalType(
            GetExternalRowField(inputObject, index, field.name),
            field.dataType),
          field.dataType)
        val convertedField = if (field.nullable) {
          If(
            Invoke(inputObject, "isNullAt", BooleanType, Literal(index) :: Nil),
            Literal.create(null, field.dataType),
            fieldValue
          )
        } else {
          fieldValue
        }
        Literal(field.name) :: convertedField :: Nil
      })

      // Only guard against a null row when the input expression can actually be null.
      if (inputObject.nullable) {
        If(IsNull(inputObject),
          Literal.create(null, inputType),
          nonNullOutput)
      } else {
        nonNullOutput
      }
  }

  /**
   * Returns the `DataType` that can be used when generating code that converts input data
   * into the Spark SQL internal format. Unlike `externalDataTypeFor`, the `DataType` returned
   * by this function can be more permissive since multiple external types may map to a single
   * internal type. For example, for an input with DecimalType in external row, its external types
   * can be `scala.math.BigDecimal`, `java.math.BigDecimal`, or
   * `org.apache.spark.sql.types.Decimal`.
   */
  def externalDataTypeForInput(dt: DataType): DataType = dt match {
    // In order to support both Decimal and java/scala BigDecimal in external row, we make this
    // as java.lang.Object.
    case _: DecimalType => ObjectType(classOf[java.lang.Object])
    // In order to support both Array and Seq in external row, we make this as java.lang.Object.
    case _: ArrayType => ObjectType(classOf[java.lang.Object])
    case _ => externalDataTypeFor(dt)
  }

  /**
   * Returns the external `DataType` that corresponds to the given Spark SQL type: native types
   * map to themselves, Python UDTs are resolved through their SQL type, and the remaining
   * handled types map to an `ObjectType` of one specific external class.
   *
   * @param dt the Spark SQL data type
   * @return the data type of the matching external value
   */
  def externalDataTypeFor(dt: DataType): DataType = dt match {
    case _ if ScalaReflection.isNativeType(dt) => dt
    case TimestampType => ObjectType(classOf[java.sql.Timestamp])
    case DateType => ObjectType(classOf[java.sql.Date])
    case _: DecimalType => ObjectType(classOf[java.math.BigDecimal])
    case StringType => ObjectType(classOf[java.lang.String])
    case _: ArrayType => ObjectType(classOf[scala.collection.Seq[_]])
    case _: MapType => ObjectType(classOf[scala.collection.Map[_, _]])
    case _: StructType => ObjectType(classOf[Row])
    case p: PythonUserDefinedType => externalDataTypeFor(p.sqlType)
    case udt: UserDefinedType[_] => ObjectType(udt.userClass)
  }

  /**
   * Builds the expression that turns a row of `schema` back into an external [[Row]].
   * Each column is read by ordinal; Python UDT columns are read as their SQL type.
   *
   * @param schema the struct type of the row to deserialize
   * @return an expression creating the external row
   */
  private def deserializerFor(schema: StructType): Expression = {
    val fields = schema.zipWithIndex.map { case (f, i) =>
      val dt = f.dataType match {
        case p: PythonUserDefinedType => p.sqlType
        case other => other
      }
      deserializerFor(GetColumnByOrdinal(i, dt))
    }
    CreateExternalRow(fields, schema)
  }

  /** Builds the deserializer for `input` using the expression's own data type. */
  private def deserializerFor(input: Expression): Expression = {
    deserializerFor(input, input.dataType)
  }

  /**
   * Builds the expression that converts `input`, an internal value of `dataType`, into its
   * external form.
   *
   * @param input expression producing the internal value
   * @param dataType the Spark SQL type of the value
   * @return the deserializing expression
   */
  private def deserializerFor(input: Expression, dataType: DataType): Expression = dataType match {
    // Native types are passed through unchanged.
    case dt if ScalaReflection.isNativeType(dt) => input

    // Python UDTs are deserialized as their underlying SQL type.
    case p: PythonUserDefinedType => deserializerFor(input, p.sqlType)

    case udt: UserDefinedType[_] =>
      val udtClass = udtClassFor(udt)
      val obj = NewInstance(
        udtClass,
        Nil,
        dataType = ObjectType(udtClass))
      Invoke(obj, "deserialize", ObjectType(udt.userClass), input :: Nil)

    case TimestampType =>
      StaticInvoke(
        DateTimeUtils.getClass,
        ObjectType(classOf[java.sql.Timestamp]),
        "toJavaTimestamp",
        input :: Nil,
        returnNullable = false)

    case DateType =>
      StaticInvoke(
        DateTimeUtils.getClass,
        ObjectType(classOf[java.sql.Date]),
        "toJavaDate",
        input :: Nil,
        returnNullable = false)

    case _: DecimalType =>
      Invoke(input, "toJavaBigDecimal", ObjectType(classOf[java.math.BigDecimal]),
        returnNullable = false)

    case StringType =>
      Invoke(input, "toString", ObjectType(classOf[String]), returnNullable = false)

    case ArrayType(et, _) =>
      // Elements are deserialized one by one, then the resulting array is wrapped as a Seq.
      val arrayData =
        Invoke(
          MapObjects(deserializerFor(_), input, et),
          "array",
          ObjectType(classOf[Array[_]]), returnNullable = false)
      StaticInvoke(
        scala.collection.mutable.WrappedArray.getClass,
        ObjectType(classOf[Seq[_]]),
        "make",
        arrayData :: Nil,
        returnNullable = false)

    case MapType(kt, vt, valueNullable) =>
      // Keys and values are deserialized as two arrays and recombined into a Scala map.
      val keyArrayType = ArrayType(kt, false)
      val keyData = deserializerFor(Invoke(input, "keyArray", keyArrayType))

      val valueArrayType = ArrayType(vt, valueNullable)
      val valueData = deserializerFor(Invoke(input, "valueArray", valueArrayType))

      StaticInvoke(
        ArrayBasedMapData.getClass,
        ObjectType(classOf[Map[_, _]]),
        "toScalaMap",
        keyData :: valueData :: Nil,
        returnNullable = false)

    case schema @ StructType(fields) =>
      // Every field is null-checked before it is deserialized.
      val convertedFields = fields.zipWithIndex.map { case (f, i) =>
        If(
          Invoke(input, "isNullAt", BooleanType, Literal(i) :: Nil),
          Literal.create(null, externalDataTypeFor(f.dataType)),
          deserializerFor(GetStructField(input, i)))
      }
      If(IsNull(input),
        Literal.create(null, externalDataTypeFor(input.dataType)),
        CreateExternalRow(convertedFields, schema))
  }
}