FileScanRDD
./sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala
/**
 * An RDD that scans a list of file partitions.
 */
class FileScanRDD(
    @transient private val sparkSession: SparkSession,
    readFunction: (PartitionedFile) => Iterator[InternalRow],
    @transient val filePartitions: Seq[FilePartition])
  extends RDD[InternalRow](sparkSession.sparkContext, Nil) {

  override def compute(split: RDDPartition, context: TaskContext): Iterator[InternalRow] = {
    val iterator = new Iterator[Object] with AutoCloseable {
      ...
      private[this] val files = split.asInstanceOf[FilePartition].files.toIterator
      private[this] var currentFile: PartitionedFile = null
      ...
      private def readCurrentFile(): Iterator[InternalRow] = {
        try {
          readFunction(currentFile)
        } catch {
          case e: FileNotFoundException =>
            throw QueryExecutionErrors.readCurrentFileNotFoundError(e)
        }
      }
      ...
    }
    ...
  }
}
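The readFunction argument is what ties FileScanRDD to a concrete file format: compute pulls PartitionedFiles off the partition's file iterator and applies readFunction to each one. A minimal sketch of the expected shape, using a hypothetical no-op reader (in Spark the function is produced by a FileFormat implementation such as ParquetFileFormat, not written by hand):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.PartitionedFile

// Hypothetical no-op reader, only to show the expected shape. A real
// readFunction opens file.filePath, seeks to file.start, decodes
// file.length bytes into rows, and appends file.partitionValues to
// each row.
val readFunction: PartitionedFile => Iterator[InternalRow] =
  (file: PartitionedFile) => Iterator.empty[InternalRow]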
FilePartition
/**
 * A collection of file blocks that should be read as a single task
 * (possibly from multiple partitioned directories).
 */
case class FilePartition(index: Int, files: Array[PartitionedFile])
  extends Partition with InputPartition {
  override def preferredLocations(): Array[String] = {
    // Computes the total number of bytes that can be retrieved from each host.
    val hostToNumBytes = mutable.HashMap.empty[String, Long]
    files.foreach { file =>
      file.locations.filter(_ != "localhost").foreach { host =>
        hostToNumBytes(host) = hostToNumBytes.getOrElse(host, 0L) + file.length
      }
    }
    // Takes the 3 hosts with the most data to be retrieved.
    hostToNumBytes.toSeq.sortBy {
      case (host, numBytes) => numBytes
    }.reverse.take(3).map {
      case (host, numBytes) => host
    }.toArray
  }
}
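The host ranking is easy to trace by hand. The following self-contained sketch (hypothetical host names and block sizes) mirrors the computation in preferredLocations: localhost is dropped, bytes are summed per host, and the three hosts holding the most bytes win:

import scala.collection.mutable

// Hypothetical blocks: (replica locations, length in bytes).
val blocks = Seq(
  (Array("host-a", "host-b"), 128L),
  (Array("host-b", "localhost"), 64L),
  (Array("host-c"), 32L))

val hostToNumBytes = mutable.HashMap.empty[String, Long]
blocks.foreach { case (locations, length) =>
  locations.filter(_ != "localhost").foreach { host =>
    hostToNumBytes(host) = hostToNumBytes.getOrElse(host, 0L) + length
  }
}
// host-b holds 192 bytes, host-a 128, host-c 32, so the result is
// Array(host-b, host-a, host-c).
val preferred = hostToNumBytes.toSeq.sortBy(-_._2).take(3).map(_._1).toArray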
PartitionedFile
/**
 * A part (i.e. "block") of a single file that should be read, along with partition column values
 * that need to be prepended to each row.
 *
 * @param partitionValues value of partition columns to be prepended to each row.
 * @param filePath URI of the file to read
 * @param start the beginning offset (in bytes) of the block.
 * @param length number of bytes to read.
 * @param locations locality information (list of nodes that have the data).
 */
case class PartitionedFile(
    partitionValues: InternalRow,
    filePath: String,
    start: Long,
    length: Long,
    @transient locations: Array[String] = Array.empty) {
  override def toString: String = {
    s"path: $filePath, range: $start-${start + length}, partition values: $partitionValues"
  }
}
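Because start and length mark a byte range rather than a whole file, one large splittable file can become several PartitionedFiles. A sketch of that slicing, with a hypothetical path and a 300-byte file cut into 128-byte blocks (Spark's actual slicing happens during scan planning and also honors whether the format is splittable):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.PartitionedFile

val fileLen = 300L
val maxSplitBytes = 128L
val slices = (0L until fileLen by maxSplitBytes).map { offset =>
  PartitionedFile(
    partitionValues = InternalRow.empty,
    filePath = "/data/part-00000.parquet", // hypothetical path
    start = offset,
    length = math.min(maxSplitBytes, fileLen - offset))
}
// slices.map(_.toString) yields the ranges 0-128, 128-256, and 256-300.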
PartitioningAwareFileIndex
override def listFiles(
    partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): Seq[PartitionDirectory] = {
  def isNonEmptyFile(f: FileStatus): Boolean = {
    isDataPath(f.getPath) && f.getLen > 0
  }
  val selectedPartitions = if (partitionSpec().partitionColumns.isEmpty) {
    PartitionDirectory(InternalRow.empty, allFiles().filter(isNonEmptyFile)) :: Nil
  } else {
    if (recursiveFileLookup) {
      throw new IllegalArgumentException(
        "Datasource with partition do not allow recursive file loading.")
    }
    prunePartitions(partitionFilters, partitionSpec()).map {
      case PartitionPath(values, path) =>
        val files: Seq[FileStatus] = leafDirToChildrenFiles.get(path) match {
          case Some(existingDir) =>
            // Directory has children files in it, return them
            existingDir.filter(f => matchPathPattern(f) && isNonEmptyFile(f))
          case None =>
            // Directory does not exist, or has no children files
            Nil
        }
        PartitionDirectory(values, files)
    }
  }
  logTrace("Selected files after partition pruning:\n\t" + selectedPartitions.mkString("\n\t"))
  selectedPartitions
}
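Seen from the query side, this method is where partition pruning pays off: predicates on partition columns arrive as partitionFilters, so directories that cannot match are never handed to the scan. A sketch under assumed conditions, with a hypothetical /data/events layout partitioned by date:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("pruning-demo")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

// Hypothetical layout:
//   /data/events/date=2024-01-01/part-00000.parquet
//   /data/events/date=2024-01-02/part-00000.parquet
val events = spark.read.parquet("/data/events")

// The predicate on the partition column `date` is extracted as a
// partitionFilter, so listFiles returns a PartitionDirectory for the
// date=2024-01-02 directory only; the other directory is never read.
events.where($"date" === "2024-01-02").count()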