/**
* A class for writing JVM objects directly to a file on disk. This class allows data to be appended
* to an existing block and can guarantee atomicity in the case of faults as it allows the caller to
* revert partial writes.
*
* This class does not support concurrent writes. Also, once the writer has been opened it cannot be
* reopened again.
*/
private[spark] class DiskBlockObjectWriter(
val file: File,
serializerInstance: SerializerInstance,
bufferSize: Int,
compressStream: OutputStream => OutputStream,
syncWrites: Boolean,
// These write metrics are shared concurrently with other active DiskBlockObjectWriters that
// are themselves performing writes. All updates must be relative.
writeMetrics: ShuffleWriteMetrics,
val blockId: BlockId = null)
extends OutputStream
with Logging {
/**
* A short-circuited method to get a block writer that can write data directly to disk.
* The block will be appended to the given file. Callers should handle error
* cases.
*/
def getDiskWriter(
    blockId: BlockId,
    file: File,
    serializerInstance: SerializerInstance,
    bufferSize: Int,
    writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
  // Read the "spark.shuffle.sync" setting (default false); the writer receives it as its
  // syncWrites flag.
  val forceSync = conf.getBoolean("spark.shuffle.sync", false)
  // Wrapper the writer applies to its raw output stream; it delegates to the serializer
  // manager's wrapForCompression for this block id.
  val wrapWithCompression: OutputStream => OutputStream =
    rawStream => serializerManager.wrapForCompression(blockId, rawStream)
  new DiskBlockObjectWriter(
    file,
    serializerInstance,
    bufferSize,
    wrapWithCompression,
    forceSync,
    writeMetrics,
    blockId)
}
/**
* Cursors used to represent positions in the file.
*
* xxxxxxxx|--------|--- |
* ^ ^ ^
* | | finalPosition
* | reportedPosition
* initialPosition
*
* initialPosition: Offset in the file where we start writing. Immutable.
* reportedPosition: Position at the time of the last update to the write metrics.
* finalPosition: Offset where we stopped writing. Set on commitAndClose() then never changed.
* -----: Current writes to the underlying file.
* xxxxx: Existing contents of the file.
*/
private val initialPosition: Long = file.length()
// Starts at -1 and is assigned the file length by commitAndClose().
private var finalPosition: Long = -1L
// Starts at the initial offset, i.e. nothing has been reported to the metrics yet.
private var reportedPosition: Long = initialPosition
/**
 * Opens the underlying file in append mode and builds the stream stack on top of it:
 * file stream -> time-tracking stream -> buffered stream -> compression wrapper ->
 * serialization stream.
 *
 * If any layer fails to construct, the already-opened file stream is closed and the stream
 * fields are reset, so a failed open() does not leak a file descriptor.
 *
 * @return this writer, to allow call chaining
 * @throws IllegalStateException if the writer has already been closed
 */
def open(): DiskBlockObjectWriter = {
  if (hasBeenClosed) {
    throw new IllegalStateException("Writer already closed. Cannot be reopened.")
  }
  // Second constructor argument `true` selects append mode, so existing contents are kept.
  fos = new FileOutputStream(file, true)
  try {
    ts = new TimeTrackingOutputStream(writeMetrics, fos)
    channel = fos.getChannel()
    bs = compressStream(new BufferedOutputStream(ts, bufferSize))
    objOut = serializerInstance.serializeStream(bs)
  } catch {
    // Caught only to release the file handle; the original failure is always rethrown.
    case t: Throwable =>
      try {
        fos.close()
      } catch {
        // Keep the root cause primary; attach the cleanup failure to it.
        case e: Exception => t.addSuppressed(e)
      }
      channel = null
      bs = null
      fos = null
      ts = null
      objOut = null
      throw t
  }
  initialized = true
  this
}
/**
 * Closes the stream stack if the writer is currently open; otherwise does nothing.
 *
 * When syncWrites is enabled, outstanding data is flushed and the file descriptor is synced
 * to disk first, and the time spent syncing is added to the write metrics.
 *
 * The writer's state is reset even if the sync or the stream close throws, so a failed
 * close never leaves the writer marked as initialized while holding dead stream references.
 */
override def close(): Unit = {
  if (initialized) {
    try {
      Utils.tryWithSafeFinally {
        if (syncWrites) {
          // Force outstanding writes to disk and track how long it takes
          objOut.flush()
          val start = System.nanoTime()
          fos.getFD.sync()
          writeMetrics.incWriteTime(System.nanoTime() - start)
        }
      } {
        objOut.close()
      }
    } finally {
      // Runs on both success and failure: drop the stream references and mark the writer
      // as closed for good (open() refuses to run once hasBeenClosed is set).
      channel = null
      bs = null
      fos = null
      ts = null
      objOut = null
      initialized = false
      hasBeenClosed = true
    }
  }
}
/**
* Flush the partial writes and commit them as a single atomic block.
*/
def commitAndClose(): Unit = {
  // close() clears `initialized`, so remember whether the writer was open beforehand.
  val wasOpen = initialized
  if (wasOpen) {
    // NOTE: Kryo does not flush the underlying stream, so both the serializer stream and
    // the lower level stream are flushed explicitly before closing.
    objOut.flush()
    bs.flush()
    close()
  }
  // Measured after close(): some compression codecs emit more bytes while closing.
  finalPosition = file.length()
  if (wasOpen) {
    writeMetrics.incBytesWritten(finalPosition - reportedPosition)
  }
  commitAndCloseHasBeenCalled = true
}
/**
* Reverts the writes made by this writer by truncating the file back to its initial
* position. Callers should invoke this function when there are runtime exceptions. This
* method will not throw, though it may be unsuccessful in truncating written data.
*
* @return the file that this DiskBlockObjectWriter wrote to.
*/
def revertPartialWritesAndClose(): File = {
  // Phase 1: undo this writer's contribution to the shared metrics and release the streams.
  // A failure here is logged but must not prevent the truncation below.
  try {
    if (initialized) {
      writeMetrics.decBytesWritten(reportedPosition - initialPosition)
      writeMetrics.decRecordsWritten(numRecordsWritten)
      // close() has to run even when a flush fails; otherwise the open streams would leak.
      Utils.tryWithSafeFinally {
        objOut.flush()
        bs.flush()
      } {
        close()
      }
    }
  } catch {
    case e: Exception =>
      logError("Uncaught exception while reverting partial writes to file " + file, e)
  }

  // Phase 2: discard everything written past the initial offset. This is attempted
  // regardless of how phase 1 went, so a failed flush cannot leave partial data behind.
  try {
    val truncateStream = new FileOutputStream(file, true)
    try {
      truncateStream.getChannel.truncate(initialPosition)
    } finally {
      truncateStream.close()
    }
  } catch {
    case e: Exception =>
      logError("Uncaught exception while reverting partial writes to file " + file, e)
  }
  file
}
/**
* Writes a key-value pair.
*/
def write(key: Any, value: Any): Unit = {
  // Open lazily on first use; open() itself throws if the writer was already closed.
  if (!initialized) {
    open()
  }

  objOut.writeKey(key)
  objOut.writeValue(value)
  // Notify the bookkeeping hook that one more record has been written.
  recordWritten()
}
/**
* Returns the file segment of committed data that this Writer has written.
* This is only valid after commitAndClose() has been called.
*/
def fileSegment(): FileSegment = {
  if (commitAndCloseHasBeenCalled) {
    // The segment spans from where this writer started to where it stopped.
    val segmentLength = finalPosition - initialPosition
    new FileSegment(file, initialPosition, segmentLength)
  } else {
    throw new IllegalStateException(
      "fileSegment() is only valid after commitAndClose() has been called")
  }
}