Spark 2.1
FileScanRDD
private def nextIterator(): Boolean = {
  ...
  // readFunction is the per-file reader returned by the file format's
  // buildReaderWithPartitionValues; for OAP it is the closure shown below.
  currentIterator = readFunction(currentFile)
  ...
}
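For orientation, here is a minimal sketch (not the actual Spark source) of how the readFunction above is applied: the format hands back a single function, and FileScanRDD invokes it once per PartitionedFile, walking the resulting iterators one file at a time.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.PartitionedFile

// Minimal sketch: each PartitionedFile yields its own iterator,
// and the scan advances through them lazily, one file at a time.
def scanFiles(
    files: Seq[PartitionedFile],
    readFunction: PartitionedFile => Iterator[InternalRow]): Iterator[InternalRow] =
  files.iterator.flatMap(readFunction)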
OptimizedParquetFileFormat
override def buildReaderWithPartitionValues(
    sparkSession: SparkSession,
    dataSchema: StructType,
    partitionSchema: StructType,
    requiredSchema: StructType,
    filters: Seq[Filter],
    options: Map[String, String],
    hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = {
  ...
  // One OapDataReaderV1 is constructed per PartitionedFile; read() produces the row iterator.
  val reader = new OapDataReaderV1(file.filePath, m, partitionSchema, requiredSchema,
    filterScanners, requiredIds, pushed, oapMetrics, conf, enableVectorizedReader, options,
    filters, context)
  reader.read(file)
  ...
}
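The value handed back to FileScanRDD is a closure of roughly the following shape. This is a hedged sketch, not OAP code: OapLikeReader and newReader are hypothetical stand-ins for OapDataReaderV1 and its construction above.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.PartitionedFile

// Hypothetical stand-in for OapDataReaderV1's read() entry point.
trait OapLikeReader { def read(file: PartitionedFile): Iterator[InternalRow] }

// buildReaderWithPartitionValues returns a function of this shape:
// one reader is created per PartitionedFile, and read() drives the scan.
def buildReader(newReader: PartitionedFile => OapLikeReader)
    : PartitionedFile => Iterator[InternalRow] = { file =>
  val reader = newReader(file) // in OAP this is the OapDataReaderV1 construction shown above
  reader.read(file)
}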
OapDataReaderWriter.scala
override def read(file: PartitionedFile): Iterator[InternalRow] = {
  ...
  // initialize() decides how the file is scanned; in this trace it continues in fullScan below.
  val iter = initialize()
  ...
}
def fullScan: OapCompletionIterator[Any] = {
  val start = if (log.isDebugEnabled) System.currentTimeMillis else 0
  // initialize() ends up here
  val iter = fileScanner.iterator(requiredIds, filters)
  val end = if (log.isDebugEnabled) System.currentTimeMillis else 0
  _totalRows = fileScanner.totalRows()
  logDebug("Construct File Iterator: " + (end - start) + " ms")
  iter
}
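The two System.currentTimeMillis calls above are guarded by log.isDebugEnabled, so timing costs nothing unless debug logging is on. The same pattern as a standalone helper (a hypothetical rewrite for illustration, not OAP code):

import org.slf4j.LoggerFactory

// Hypothetical helper mirroring fullScan's debug-only timing: the clock is read
// only when debug logging is enabled, so the hot path pays nothing otherwise.
object ScanTiming {
  private val log = LoggerFactory.getLogger(getClass)

  def timed[T](label: String)(body: => T): T = {
    val start = if (log.isDebugEnabled) System.currentTimeMillis else 0L
    val result = body
    val end = if (log.isDebugEnabled) System.currentTimeMillis else 0L
    log.debug(s"$label: ${end - start} ms")
    result
  }
}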
ParquetDataFile
def iterator(
    requiredIds: Array[Int],
    filters: Seq[Filter] = Nil): OapCompletionIterator[Any] = {
  val iterator = context match {
    case Some(c) =>
      // A Parquet row group's row count can exceed Int.MaxValue; in that case we
      // must not cache the data in memory and instead fall back to reading the
      // row group from the file directly.
      if (parquetDataCacheEnable &&
        !meta.footer.getBlocks.asScala.exists(_.getRowCount > Int.MaxValue)) {
        addRequestSchemaToConf(configuration, requiredIds)
        // the data-cache path continues here
        initCacheReader(requiredIds, c,
          new VectorizedCacheReader(configuration,
            meta.footer.toParquetMetadata(), this, requiredIds))
      } else {
        addRequestSchemaToConf(configuration, requiredIds)
        initVectorizedReader(c,
          new VectorizedOapRecordReader(file, configuration, meta.footer))
      }
    case _ =>
      addRequestSchemaToConf(configuration, requiredIds)
      initRecordReader(
        new MrOapRecordReader[UnsafeRow](new ParquetReadSupportWrapper,
          file, configuration, meta.footer))
  }
  iterator.asInstanceOf[OapCompletionIterator[Any]]
}
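Summarizing the branch above: when a vectorized-read context is present, the reader is additionally backed by the data cache only if parquetDataCacheEnable is on and no row group exceeds Int.MaxValue rows; without a context, the row-based MrOapRecordReader is used. A hypothetical summary of that decision (ReadPath and choosePath are not OAP types):

import scala.collection.JavaConverters._
import org.apache.parquet.hadoop.metadata.ParquetMetadata

// Hypothetical summary of the three read paths selected above.
sealed trait ReadPath
case object CachedVectorized extends ReadPath // VectorizedCacheReader
case object PlainVectorized extends ReadPath  // VectorizedOapRecordReader
case object RowBased extends ReadPath         // MrOapRecordReader[UnsafeRow]

def choosePath(
    hasVectorizedContext: Boolean,
    cacheEnabled: Boolean,
    footer: ParquetMetadata): ReadPath = {
  // A row group whose row count exceeds Int.MaxValue cannot be held as one
  // cached in-memory batch, so such files skip the cache entirely.
  val cacheable = cacheEnabled &&
    !footer.getBlocks.asScala.exists(_.getRowCount > Int.MaxValue)
  if (!hasVectorizedContext) RowBased
  else if (cacheable) CachedVectorized
  else PlainVectorized
}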
VectorizedCacheReader.scala
protected def initializeMetas(): Unit = {
  this.fileSchema = footer.getFileMetaData.getSchema
  val fileMetadata = footer.getFileMetaData.getKeyValueMetaData
  // the cache reader's initialization arrives here
  val readContext = new ParquetReadSupportWrapper()
    .init(new InitContext(configuration, Collections3.toSetMultiMap(fileMetadata), fileSchema))
  this.requestedSchema = readContext.getRequestedSchema
  val sparkRequestedSchemaString =
    configuration.get(ParquetReadSupportWrapper.SPARK_ROW_REQUESTED_SCHEMA)
  this.sparkSchema = StructType.fromString(sparkRequestedSchemaString)
  val rowGroupMetas = footer.getBlocks.asScala
  this.rowGroupMetaIter = rowGroupMetas.iterator
  for (block <- rowGroupMetas) {
    this.totalRowCount += block.getRowCount
  }
}
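The totalRowCount accumulated in the loop is just the sum of each row group's row count from the footer. The same figure, as a standalone sketch against the plain Parquet metadata API:

import scala.collection.JavaConverters._
import org.apache.parquet.hadoop.metadata.ParquetMetadata

// Standalone sketch: total row count of a Parquet file, summed over its row groups.
def totalRows(footer: ParquetMetadata): Long =
  footer.getBlocks.asScala.map(_.getRowCount).sum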
ParquetReadSupportWrapper.scala
override def init(context: InitContext): ReadContext = {
  // Thin wrapper: delegate straight to Spark's ParquetReadSupport.
  readSupport.init(context)
}
ParquetReadSupport.scala
override def init(context: InitContext): ReadContext = {
  ...
  // Clip the Parquet file schema down to the columns Catalyst actually requested.
  val parquetRequestedSchema =
    ParquetReadSupport.clipParquetSchema(context.getFileSchema, catalystRequestedSchema)
  ...
}
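clipParquetSchema prunes the Parquet file schema down to exactly the columns Catalyst requested, which is why the requestedSchema picked up back in VectorizedCacheReader.initializeMetas can be narrower than the file schema. A small illustration (the method itself is package-private, so the expected result is only described in the final comment):

import org.apache.parquet.schema.MessageTypeParser
import org.apache.spark.sql.types._

// File schema as recorded in the Parquet footer: three columns.
val fileSchema = MessageTypeParser.parseMessageType(
  """message spark_schema {
    |  required int64 id;
    |  optional binary name (UTF8);
    |  optional double score;
    |}""".stripMargin)

// Catalyst only asked for two of them.
val catalystRequestedSchema = StructType(Seq(
  StructField("id", LongType, nullable = false),
  StructField("score", DoubleType)))

// clipParquetSchema(fileSchema, catalystRequestedSchema) yields a MessageType
// containing only `id` and `score`, so the readers never touch `name`.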