OrcColumnarBatchReader.java — nextBatch(): reads the next ORC VectorizedRowBatch and exposes it as a Spark ColumnarBatch
/**
* Return true if there exists more data in the next batch. If exists, prepare the next batch
* by copying from ORC VectorizedRowBatch columns to Spark ColumnarBatch columns.
*/
/**
 * Advances the reader to the next ORC batch.
 *
 * @return false when the underlying record reader is exhausted (zero rows);
 *         true otherwise, with {@code columnarBatch} populated for the rows
 *         just read.
 * @throws IOException if the ORC record reader fails
 */
private boolean nextBatch() throws IOException {
  recordReader.nextBatch(batch);
  final int rowCount = batch.size;
  if (rowCount == 0) {
    return false;
  }
  columnarBatch.setNumRows(rowCount);

  if (!copyToSpark) {
    // Zero-copy path: the wrappers hold references to the ORC column
    // vectors, so only the logical row count needs to be refreshed.
    // A requested id of -1 marks a field with no backing ORC column.
    for (int col = 0; col < requiredFields.length; col++) {
      if (requestedColIds[col] != -1) {
        ((OrcColumnVector) orcVectorWrappers[col]).setBatchSize(rowCount);
      }
    }
    return true;
  }

  // Copying path: clear every writable Spark vector, then copy each
  // requested ORC column into its Spark counterpart.
  for (WritableColumnVector sparkVector : columnVectors) {
    sparkVector.reset();
  }
  for (int col = 0; col < requiredFields.length; col++) {
    final int orcColId = requestedColIds[col];
    if (orcColId < 0) {
      continue; // field not present in the ORC file; vector stays reset
    }
    StructField field = requiredFields[col];
    WritableColumnVector toColumn = columnVectors[col];
    ColumnVector fromColumn = batch.cols[orcColId];
    // Dispatch on the ORC vector's layout: a single repeated value,
    // a dense null-free column, or the general nullable case.
    if (fromColumn.isRepeating) {
      putRepeatingValues(rowCount, field, fromColumn, toColumn);
    } else if (fromColumn.noNulls) {
      putNonNullValues(rowCount, field, fromColumn, toColumn);
    } else {
      putValues(rowCount, field, fromColumn, toColumn);
    }
  }
  return true;
}
Configuration key: spark.sql.orc.enableVectorizedReader
// SQLConf.scala
// Feature flag gating the vectorized (batch) ORC reader; enabled by default,
// so the copy/zero-copy paths in OrcColumnarBatchReader are the normal route.
val ORC_VECTORIZED_READER_ENABLED = buildConf("spark.sql.orc.enableVectorizedReader")
.doc("Enables vectorized orc decoding.")
.booleanConf
.createWithDefault(true)