Spark-SerializerManager&JavaSerializer&KryoSerializer 源码解析
abstract class Serializer
这个类是 spark 序列化的抽象类,规定了通用的方法。
// Class loader configured through setDefaultClassLoader; None until one has been set.
// Declared @volatile because the field is reassigned after construction.
@volatile protected var defaultClassLoader: Option[ClassLoader] = None
/**
 * Stores `classLoader` as this serializer's default class loader.
 *
 * The value is wrapped in `Some` as-is (a null argument is not converted to `None`).
 *
 * @param classLoader the class loader to remember
 * @return this serializer, so that calls can be chained
 */
def setDefaultClassLoader(classLoader: ClassLoader): Serializer = {
  val configured: Option[ClassLoader] = Some(classLoader)
  defaultClassLoader = configured
  this
}
// Creates a new SerializerInstance; concrete serializers supply the implementation.
def newInstance(): SerializerInstance
// Whether this serializer supports relocation of its serialized objects. Defaults to false.
// NOTE(review): the precise relocation contract is not shown in this excerpt — confirm against
// the upstream Scaladoc before overriding this to true.
private[spark] def supportsRelocationOfSerializedObjects: Boolean = false
JavaSerializer
// Read from "spark.serializer.objectStreamReset" (default 100).
// A var because readExternal reassigns it when the serializer itself is deserialized.
private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100)
// Read from "spark.serializer.extraDebugInfo" (default true); also reassigned by readExternal.
private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true)
// No-arg constructor backed by a fresh SparkConf; the fields above are then overwritten
// by readExternal.
protected def this() = this(new SparkConf()) // For deserialization only
/**
 * Builds a JavaSerializerInstance from the current settings.
 *
 * Uses the configured default class loader when one is set, otherwise the calling
 * thread's context class loader.
 */
override def newInstance(): SerializerInstance = {
  val loader = defaultClassLoader match {
    case Some(configured) => configured
    case None => Thread.currentThread.getContextClassLoader
  }
  new JavaSerializerInstance(counterReset, extraDebugInfo, loader)
}
// Writes this serializer's settings: counterReset as an int, then extraDebugInfo as a boolean.
// readExternal reads the values back in this same order, so the order must not change.
// NOTE(review): Utils.tryOrIOException is defined elsewhere; presumably it surfaces failures
// as IOException — confirm.
override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
out.writeInt(counterReset)
out.writeBoolean(extraDebugInfo)
}
// Restores this serializer's settings: an int into counterReset, then a boolean into
// extraDebugInfo — the same order in which writeExternal emits them.
// NOTE(review): Utils.tryOrIOException is defined elsewhere; presumably it surfaces failures
// as IOException — confirm.
override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
counterReset = in.readInt()
extraDebugInfo = in.readBoolean()
}
KryoSerializer
abstract class SerializerInstance
序列化Instance 对象 的 抽象类。
// Serializes `t` and returns the result as a ByteBuffer.
def serialize[T: ClassTag](t: T): ByteBuffer
// Deserializes an object of type T from `bytes`.
def deserialize[T: ClassTag](bytes: ByteBuffer): T
// Deserializes an object of type T from `bytes`, using the supplied class loader.
def deserialize[T: ClassTag](bytes: ByteBuffer, loader: ClassLoader): T
// Returns a SerializationStream that writes to the output stream `s`.
def serializeStream(s: OutputStream): SerializationStream
// Returns a DeserializationStream that reads from the input stream `s`.
def deserializeStream(s: InputStream): DeserializationStream
JavaSerializerInstance
KryoSerializerInstance
abstract class SerializationStream
// Writes a single object to the stream; returns a SerializationStream so calls can be chained.
def writeObject[T: ClassTag](t: T): SerializationStream
// Writes a key; by default simply delegates to writeObject.
def writeKey[T: ClassTag](key: T): SerializationStream = writeObject(key)
// Writes a value; by default simply delegates to writeObject.
def writeValue[T: ClassTag](value: T): SerializationStream = writeObject(value)
// Flushes the stream.
def flush(): Unit
// Closes the stream.
override def close(): Unit
/**
 * Writes every remaining element of `iter` to this stream via writeObject, in iteration order.
 *
 * The stream is neither flushed nor closed here.
 *
 * @param iter source of the elements to write; it is fully consumed
 * @return this stream, so that calls can be chained
 */
def writeAll[T: ClassTag](iter: Iterator[T]): SerializationStream = {
  iter.foreach { element =>
    writeObject(element)
  }
  this
}
JavaSerializationStream
JavaSerializationStream继承了SerializationStream,其内部有2个属性:
private val objOut = new ObjectOutputStream(out) // Java's built-in object serialization over `out`
private var counter = 0 // objects written since the last objOut.reset(); maintained by writeObject
方法:
/**
 * Serializes `t` with the underlying ObjectOutputStream.
 *
 * When extraDebugInfo is enabled, a NotSerializableException is rethrown as whatever
 * SerializationDebugger.improveException returns for it. After each successful write the
 * counter is incremented; once it reaches counterReset (and counterReset is positive) the
 * object stream is reset — discarding its record of previously written objects — and the
 * counter starts again from zero.
 *
 * @return this stream, so that calls can be chained
 */
def writeObject[T: ClassTag](t: T): SerializationStream = {
  try {
    objOut.writeObject(t)
  } catch {
    case notSerializable: NotSerializableException if extraDebugInfo =>
      throw SerializationDebugger.improveException(t, notSerializable)
  }
  counter += 1
  val resetDue = counterReset > 0 && counter >= counterReset
  if (resetDue) {
    objOut.reset()
    counter = 0
  }
  this
}
/** Flushes the underlying ObjectOutputStream. */
def flush(): Unit = objOut.flush()
/** Closes the underlying ObjectOutputStream. */
def close(): Unit = objOut.close()
KryoSerializationStream
KryoSerializationStream 继承自SerializationStream
属性和方法:
// Kryo output over outStream: the unsafe variant when useUnsafe is set, the regular one otherwise.
// A var because close() sets it to null, which flush() and close() use as the "closed" marker.
private[this] var output: KryoOutput =
if (useUnsafe) new KryoUnsafeOutput(outStream) else new KryoOutput(outStream)
// Kryo instance borrowed from serInstance; handed back through releaseKryo in close().
private[this] var kryo: Kryo = serInstance.borrowKryo()
// Writes `t` through Kryo's writeClassAndObject and returns this stream for chaining.
// There is no closed-stream check here: after close(), `kryo` and `output` are null.
override def writeObject[T: ClassTag](t: T): SerializationStream = {
kryo.writeClassAndObject(output, t)
this
}
/**
 * Flushes the Kryo output.
 *
 * @throws IOException if the stream has already been closed (output is null)
 */
override def flush(): Unit = {
  Option(output) match {
    case Some(open) => open.flush()
    case None => throw new IOException("Stream is closed")
  }
}
/**
 * Closes the Kryo output and hands the borrowed Kryo instance back to serInstance.
 *
 * Calling it again after a first close is a no-op. The Kryo instance is released and both
 * fields are nulled even when closing the output throws.
 */
override def close(): Unit = {
  val stillOpen = output != null
  if (stillOpen) {
    try {
      output.close()
    } finally {
      serInstance.releaseKryo(kryo)
      kryo = null
      output = null
    }
  }
}
abstract class DeserializationStream
// Reads the next object from the stream. asIterator / asKeyValueIterator treat an EOFException
// thrown from here as the end-of-stream signal.
def readObject[T: ClassTag](): T
// Reads a key; by default simply delegates to readObject.
def readKey[T: ClassTag](): T = readObject[T]()
// Reads a value; by default simply delegates to readObject.
def readValue[T: ClassTag](): T = readObject[T]()
// Closes the stream.
override def close(): Unit
/**
 * Exposes the objects of this stream as an iterator.
 *
 * Each element comes from readObject; an EOFException marks the iterator as finished
 * instead of propagating. The iterator's close hook closes this stream.
 */
def asIterator: Iterator[Any] = new NextIterator[Any] {
  override protected def getNext(): Any = {
    try {
      readObject[Any]()
    } catch {
      case _: EOFException =>
        // End of data: flag completion; the null placeholder is returned alongside the flag.
        finished = true
        null
    }
  }

  override protected def close(): Unit = DeserializationStream.this.close()
}
/**
 * Exposes this stream as an iterator of (key, value) pairs.
 *
 * Each pair is built from one readKey call followed by one readValue call. An EOFException
 * from either call marks the iterator as finished; if it is raised while reading the value,
 * the key that was already read is dropped. The iterator's close hook closes this stream.
 */
def asKeyValueIterator: Iterator[(Any, Any)] = new NextIterator[(Any, Any)] {
  override protected def getNext(): (Any, Any) = {
    try {
      val key = readKey[Any]()
      val value = readValue[Any]()
      (key, value)
    } catch {
      case _: EOFException =>
        // End of data: flag completion; the null placeholder is returned alongside the flag.
        finished = true
        null
    }
  }

  override protected def close(): Unit = DeserializationStream.this.close()
}
JavaDeserializationStream
JavaDeserializationStream 继承自DeserializationStream。其内部有1个属性:
// Java object deserialization over `in`. Classes are resolved through `loader`
// (without initializing them) rather than through ObjectInputStream's default lookup.
private val objIn = new ObjectInputStream(in) {
  override def resolveClass(desc: ObjectStreamClass): Class[_] = {
    val className = desc.getName
    try {
      // scalastyle:off classforname
      Class.forName(className, false, loader)
      // scalastyle:on classforname
    } catch {
      // Fall back to the companion's primitiveMappings table; the original exception is
      // rethrown when the name is not present there.
      case notFound: ClassNotFoundException =>
        JavaDeserializationStream.primitiveMappings.getOrElse(className, throw notFound)
    }
  }
}
/** Reads the next object from the underlying ObjectInputStream and casts it to T (unchecked). */
def readObject[T: ClassTag](): T = {
  val decoded = objIn.readObject()
  decoded.asInstanceOf[T]
}
/** Closes the underlying ObjectInputStream. */
def close(): Unit = objIn.close()
KryoDeserializationStream
KryoDeserializationStream 继承自 DeserializationStream。
// Kryo input over inStream: the unsafe variant when useUnsafe is set, the regular one otherwise.
// A var because close() sets it to null, which close() uses as the "already closed" marker.
private[this] var input: KryoInput =
if (useUnsafe) new KryoUnsafeInput(inStream) else new KryoInput(inStream)
// Kryo instance borrowed from serInstance; handed back through releaseKryo in close().
private[this] var kryo: Kryo = serInstance.borrowKryo()
/**
 * Reads the next object through Kryo's readClassAndObject and casts it to T (unchecked).
 *
 * A KryoException whose message contains "buffer underflow" (case-insensitive) is translated
 * into an EOFException. Any other KryoException — including one that carries no message at
 * all — propagates unchanged.
 *
 * @throws EOFException when Kryo reports a buffer underflow
 */
override def readObject[T: ClassTag](): T = {
  try {
    kryo.readClassAndObject(input).asInstanceOf[T]
  } catch {
    // DeserializationStream uses the EOF exception to indicate stopping condition.
    // getMessage may be null; wrap it in Option so the guard cannot throw a
    // NullPointerException that would hide the original KryoException.
    case e: KryoException
      if Option(e.getMessage).exists(_.toLowerCase(Locale.ROOT).contains("buffer underflow")) =>
      throw new EOFException
  }
}
/**
 * Closes the Kryo input and hands the borrowed Kryo instance back to serInstance.
 *
 * Calling it again after a first close is a no-op. The Kryo instance is released and both
 * fields are nulled even when closing the input throws.
 */
override def close(): Unit = {
  val stillOpen = input != null
  if (stillOpen) {
    try {
      // Kryo's Input automatically closes the input stream it is using.
      input.close()
    } finally {
      serInstance.releaseKryo(kryo)
      kryo = null
      input = null
    }
  }
}
SerializerManager
通过这个类 来管理各个 序列化和反序列化对象的 实现方法。
用来判断某个字段或者类型应该使用哪一种序列化实例来序列化。
这个版本的spark已经开始支持kryo的序列化了,所以其内部属性存在一个kryoSerializer。
属性:
private[this] val kryoSerializer = new KryoSerializer(conf) // Kryo serializer built from conf
private[this] val stringClassTag: ClassTag[String] = implicitly[ClassTag[String]] // ClassTag for String
private[this] val primitiveAndPrimitiveArrayClassTags: Set[ClassTag[_]] // ClassTags of the primitive types and their array types (initializer omitted in this excerpt)
// Whether broadcast blocks are compressed ("spark.broadcast.compress", default true).
private[this] val compressBroadcast = conf.getBoolean("spark.broadcast.compress", true)
// Whether shuffle blocks are compressed ("spark.shuffle.compress", default true).
private[this] val compressShuffle = conf.getBoolean("spark.shuffle.compress", true)
// Whether RDD blocks are compressed ("spark.rdd.compress", default false).
private[this] val compressRdds = conf.getBoolean("spark.rdd.compress", false)
// Whether temp local blocks are compressed ("spark.shuffle.spill.compress", default true).
private[this] val compressShuffleSpill = conf.getBoolean("spark.shuffle.spill.compress", true)
// Codec used by both wrapForCompression overloads for every block type that shouldCompress
// accepts (not only shuffle spills). Lazy, so it is created from conf on first use.
private lazy val compressionCodec: CompressionCodec = CompressionCodec.createCodec(conf)
方法:
/**
 * Forwards `classLoader` to the internal Kryo serializer as its default class loader.
 * The serializer returned by that call is discarded.
 */
def setDefaultClassLoader(classLoader: ClassLoader): Unit =
  kryoSerializer.setDefaultClassLoader(classLoader)
/** True when an encryption key is defined, i.e. encryption is enabled. */
def encryptionEnabled: Boolean = encryptionKey.nonEmpty
/**
 * Tells whether values described by `ct` may be serialized with Kryo: true for String and
 * for any tag contained in primitiveAndPrimitiveArrayClassTags.
 */
def canUseKryo(ct: ClassTag[_]): Boolean = {
  val isString = ct == stringClassTag
  isString || primitiveAndPrimitiveArrayClassTags.contains(ct)
}
/**
 * Chooses the serializer for values of the given class tag.
 *
 * @param ct       class tag of the values to serialize
 * @param autoPick when false, the default serializer is always returned
 * @return the Kryo serializer when `autoPick` is set and canUseKryo accepts `ct`,
 *         otherwise the default serializer
 */
def getSerializer(ct: ClassTag[_], autoPick: Boolean): Serializer = {
  val kryoEligible = autoPick && canUseKryo(ct)
  if (kryoEligible) kryoSerializer else defaultSerializer
}
/**
 * Chooses the serializer for key/value records.
 *
 * @return the Kryo serializer when canUseKryo accepts both the key tag and the value tag,
 *         otherwise the default serializer
 */
def getSerializer(keyClassTag: ClassTag[_], valueClassTag: ClassTag[_]): Serializer = {
  val bothKryoEligible = canUseKryo(keyClassTag) && canUseKryo(valueClassTag)
  if (bothKryoEligible) kryoSerializer else defaultSerializer
}
/**
 * Decides from the block id's type whether the block's data is compressed:
 * shuffle and temp-shuffle blocks follow compressShuffle, broadcast blocks compressBroadcast,
 * RDD blocks compressRdds, temp-local blocks compressShuffleSpill; any other block type
 * is never compressed.
 */
private def shouldCompress(blockId: BlockId): Boolean = blockId match {
  case _: ShuffleBlockId | _: TempShuffleBlockId => compressShuffle
  case _: BroadcastBlockId => compressBroadcast
  case _: RDDBlockId => compressRdds
  case _: TempLocalBlockId => compressShuffleSpill
  case _ => false
}
/**
 * Wraps an input stream for a block: first with wrapForEncryption, then the result with
 * wrapForCompression (which only adds a layer when the block type is compressed).
 */
def wrapStream(blockId: BlockId, s: InputStream): InputStream = {
  val encryptionLayer = wrapForEncryption(s)
  wrapForCompression(blockId, encryptionLayer)
}
/**
 * Wraps an output stream for a block: first with wrapForEncryption, then the result with
 * wrapForCompression (which only adds a layer when the block type is compressed).
 */
def wrapStream(blockId: BlockId, s: OutputStream): OutputStream = {
  val encryptionLayer = wrapForEncryption(s)
  wrapForCompression(blockId, encryptionLayer)
}
/**
 * Returns `s` wrapped in the codec's compressed output stream when shouldCompress accepts
 * the block; otherwise returns `s` unchanged.
 */
def wrapForCompression(blockId: BlockId, s: OutputStream): OutputStream = {
  if (!shouldCompress(blockId)) {
    s
  } else {
    compressionCodec.compressedOutputStream(s)
  }
}
/**
 * Returns `s` wrapped in the codec's compressedInputStream (the read side of compressed
 * data) when shouldCompress accepts the block; otherwise returns `s` unchanged.
 */
def wrapForCompression(blockId: BlockId, s: InputStream): InputStream = {
  if (!shouldCompress(blockId)) {
    s
  } else {
    compressionCodec.compressedInputStream(s)
  }
}
/**
 * Serializes `values` into `outputStream` for the given block.
 *
 * The output is buffered, optionally compressed (per wrapForCompression), and closed once all
 * values have been written. Automatic serializer selection is switched off for StreamBlockId
 * blocks, which therefore always use the default serializer.
 */
def dataSerializeStream[T: ClassTag](
    blockId: BlockId,
    outputStream: OutputStream,
    values: Iterator[T]): Unit = {
  val autoPick = blockId match {
    case _: StreamBlockId => false
    case _ => true
  }
  val bufferedSink = new BufferedOutputStream(outputStream)
  val instance = getSerializer(implicitly[ClassTag[T]], autoPick).newInstance()
  val serStream = instance.serializeStream(wrapForCompression(blockId, bufferedSink))
  serStream.writeAll(values).close()
}
/**
 * Serializes `values` into a ChunkedByteBuffer, taking the class tag from the context bound
 * and delegating to dataSerializeWithExplicitClassTag.
 */
def dataSerialize[T: ClassTag](
    blockId: BlockId,
    values: Iterator[T]): ChunkedByteBuffer = {
  val elementTag = implicitly[ClassTag[T]]
  dataSerializeWithExplicitClassTag(blockId, values, elementTag)
}
/**
 * Serializes `values` into a ChunkedByteBuffer using an explicitly supplied class tag.
 *
 * Data is written through a buffered, optionally compressed stream into a
 * ChunkedByteBufferOutputStream created with a 4 MiB size argument and ByteBuffer.allocate
 * as the allocator. Automatic serializer selection is switched off for StreamBlockId blocks.
 */
def dataSerializeWithExplicitClassTag(
    blockId: BlockId,
    values: Iterator[_],
    classTag: ClassTag[_]): ChunkedByteBuffer = {
  val chunkSizeBytes = 1024 * 1024 * 4
  val chunkedSink = new ChunkedByteBufferOutputStream(chunkSizeBytes, ByteBuffer.allocate)
  val bufferedSink = new BufferedOutputStream(chunkedSink)
  val autoPick = blockId match {
    case _: StreamBlockId => false
    case _ => true
  }
  val instance = getSerializer(classTag, autoPick).newInstance()
  val serStream = instance.serializeStream(wrapForCompression(blockId, bufferedSink))
  serStream.writeAll(values).close()
  chunkedSink.toChunkedByteBuffer
}
/**
 * Deserializes the contents of `inputStream` for the given block into an iterator.
 *
 * The input is buffered and passed through wrapForCompression before being handed to the
 * chosen serializer. Automatic serializer selection is switched off for StreamBlockId blocks.
 * The resulting iterator is cast to Iterator[T] without a runtime check.
 */
def dataDeserializeStream[T](
    blockId: BlockId,
    inputStream: InputStream)
    (classTag: ClassTag[T]): Iterator[T] = {
  val bufferedSource = new BufferedInputStream(inputStream)
  val autoPick = blockId match {
    case _: StreamBlockId => false
    case _ => true
  }
  val instance = getSerializer(classTag, autoPick).newInstance()
  val deserStream = instance.deserializeStream(wrapForCompression(blockId, bufferedSource))
  deserStream.asIterator.asInstanceOf[Iterator[T]]
}