RecordReaderImpl.java
/**
 * Reads the next batch of rows into {@code batch}.
 *
 * <p>If the current stripe has been fully consumed, the reader first moves to the next
 * stripe. The number of rows requested is the value returned by {@code computeBatchSize}
 * for the batch's maximum size.
 *
 * @param batch the batch to fill; its {@code size} and {@code selectedInUse} fields are
 *     overwritten by this call
 * @return {@code true} if {@code batch.size} is non-zero after the read; {@code false} when
 *     no stripes remain or the computed batch size is zero
 * @throws IOException if an {@code IOException} occurs while reading; it is re-wrapped with
 *     the file path in the message and the original exception as the cause
 */
@Override
public boolean nextBatch(VectorizedRowBatch batch) throws IOException {
try {
// The current stripe is exhausted once the cursor reaches its row count.
if (rowInStripe >= rowCountInStripe) {
currentStripe += 1;
// No more stripes: report an empty batch and signal end of data.
if (currentStripe >= stripes.size()) {
batch.size = 0;
return false;
}
readStripe();
}
int batchSize = computeBatchSize(batch.getMaxSize());
// The stripe cursor is advanced before the column readers fill the batch.
rowInStripe += batchSize;
reader.setVectorColumnCount(batch.getDataColumnCount());
reader.nextBatch(batch, batchSize);
batch.selectedInUse = false;
batch.size = batchSize;
// rowInStripe + rowBaseInStripe is the cursor counted across all selected stripes.
// NOTE(review): advanceToNextRow is defined elsewhere; presumably it repositions the
// reader on the next row to be read - confirm.
advanceToNextRow(reader, rowInStripe + rowBaseInStripe, true);
return batch.size != 0;
} catch (IOException e) {
// Rethrow with the file path in the message; the original exception is kept as the cause.
throw new IOException("Error reading file: " + path, e);
}
}
rowCountInStripe
rowCountInStripe：当前 stripe 的总行数（stripe.getNumberOfRows()）。
rowBaseInStripe：当前 stripe 之前所有已选中 stripe 的累计行数。
rowInStripe：当前 stripe 内的读取位置，即该 stripe 中已经读取的行数。
/**
 * Prepares the reader state for the stripe at index {@code currentStripe}.
 *
 * <p>Reads the stripe footer, calls {@code clearStreams()}, resets the in-stripe cursor,
 * recomputes the number of rows that precede this stripe, and empties the {@code indexes}
 * array.
 *
 * @return the {@code StripeInformation} of the stripe that is about to be read
 * @throws IOException if reading the stripe footer fails
 */
private StripeInformation beginReadStripe() throws IOException {
  final StripeInformation current = stripes.get(currentStripe);
  stripeFooter = readStripeFooter(current);
  clearStreams();

  // Start at the first row of this stripe.
  rowInStripe = 0;
  rowCountInStripe = current.getNumberOfRows();

  // Total the rows of every stripe that comes before the current one.
  rowBaseInStripe = 0;
  int earlier = 0;
  while (earlier < currentStripe) {
    rowBaseInStripe += stripes.get(earlier).getNumberOfRows();
    earlier++;
  }

  // Drop every index entry held from the previous stripe.
  for (int slot = 0; slot < indexes.length; slot++) {
    indexes[slot] = null;
  }
  return current;
}
totalRowCount
所有被选中的 stripe（起始偏移量落在 [offset, maxOffset) 范围内）的行数之和；起始偏移量小于 offset 的 stripe 只计入 skippedRows。
for(StripeInformation stripe: fileReader.getStripes()) {
long stripeStart = stripe.getOffset();
if (offset > stripeStart) {
skippedRows += stripe.getNumberOfRows();
} else if (stripeStart < maxOffset) {
this.stripes.add(stripe);
rows += stripe.getNumberOfRows();
}
}
...
totalRowCount = rows;
computeBatchSize
该方法计算下一个 batch 应读取的行数：结果不超过 targetBatchSize，并且不会越过当前 stripe（启用 PPD 时为当前连续选中的 row group 区间）的边界。
/**
 * Works out how many rows the next batch should hold.
 *
 * <p>The result never exceeds {@code targetBatchSize}. Without row-group selection it is
 * capped by the rows left in the current stripe. With row-group selection it is capped by
 * the distance from the stripe cursor to the end of the current run of included row groups
 * (or to the end of the stripe, whichever comes first).
 *
 * @param targetBatchSize the largest number of rows the caller can accept
 * @return the number of rows to read for the next batch
 */
private int computeBatchSize(long targetBatchSize) {
  final boolean rowGroupAware =
      rowIndexStride != 0 && includedRowGroups != null && rowInStripe < rowCountInStripe;
  if (!rowGroupAware) {
    // Only the stripe boundary limits the batch here.
    return (int) Math.min(targetBatchSize, (rowCountInStripe - rowInStripe));
  }

  // In case of PPD the batch must respect row group boundaries. When only a subset of the
  // row groups is selected, the marker is placed at the end of the selected range so that
  // the batch cannot run past it. See https://issues.apache.org/jira/browse/HIVE-6287
  int firstGroup = (int) (rowInStripe / rowIndexStride);
  if (!includedRowGroups[firstGroup]) {
    // Move forward to the first included row group (or past the last group).
    do {
      firstGroup++;
    } while (firstGroup < includedRowGroups.length && !includedRowGroups[firstGroup]);
  }

  // Find the first group after the contiguous run of included groups.
  int groupAfterRun = firstGroup;
  while (groupAfterRun < includedRowGroups.length && includedRowGroups[groupAfterRun]) {
    groupAfterRun++;
  }

  // The marker is the end of that run, clamped to the end of the stripe.
  final long markerPosition = Math.min(groupAfterRun * rowIndexStride, rowCountInStripe);
  final int batchSize = (int) Math.min(targetBatchSize, (markerPosition - rowInStripe));
  if (isLogDebugEnabled && batchSize < targetBatchSize) {
    LOG.debug("markerPosition: " + markerPosition + " batchSize: " + batchSize);
  }
  return batchSize;
}
org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch.java
TODO
StripeInformation.java
元数据接口（interface）：描述 ORC 文件中单个 stripe 的位置、各部分长度和行数。
Information about the stripes in an ORC file that is provided by the Reader.
/**
 * Information about a stripe in an ORC file, as provided by the Reader: where the stripe
 * starts, how long it and its sections are in bytes, and how many rows it holds.
 */
public interface StripeInformation {
/**
 * Get the byte offset of the start of the stripe.
 * @return the offset, in bytes, from the start of the file
 */
long getOffset();
/**
 * Get the total length of the stripe in bytes.
 * @return the number of bytes in the stripe
 */
long getLength();
/**
 * Get the length of the stripe's indexes.
 * @return the number of bytes in the index
 */
long getIndexLength();
/**
 * Get the length of the stripe's data.
 * @return the number of bytes in the stripe's data
 */
long getDataLength();
/**
 * Get the length of the stripe's tail section.
 * NOTE(review): the original comment said the tail "contains its index", yet the index
 * length is reported separately by {@link #getIndexLength()} - confirm what the tail holds.
 * @return the number of bytes in the tail
 */
long getFooterLength();
/**
 * Get the number of rows in the stripe.
 * @return a count of the number of rows
 */
long getNumberOfRows();
}