A Cassandra SSTable is a set of db files: besides the data file Data.db and the Filter.db file holding the rowkey Bloom filter, there are also Index.db, Statistics.db, and other components. The Index.db file records the positions of rowkeys within Data.db, so that a rowkey lookup can quickly seek to the right offset in Data.db.
IndexInfo is used in two places: rowkey lookup and IColumn-name lookup.
Before Cassandra 1.2, the Index.db file recorded each rowkey together with the position of the corresponding row in Data.db.
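To make that pre-1.2 layout concrete, here is a small standalone dump tool, assuming each Index.db entry is a 2-byte key length, the key bytes, and an 8-byte Data.db offset; the class and its names are illustrative, not part of Cassandra:
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;

public class LegacyIndexDump
{
    public static void main(String[] args) throws IOException
    {
        try (DataInputStream in = new DataInputStream(new FileInputStream(args[0])))
        {
            while (true)
            {
                int keyLength;
                try { keyLength = in.readUnsignedShort(); }
                catch (EOFException eof) { break; } // clean end of file

                byte[] key = new byte[keyLength];
                in.readFully(key);                  // the rowkey bytes
                long dataPosition = in.readLong();  // offset of the row in Data.db
                System.out.printf("key=%s -> Data.db offset %d%n", new String(key), dataPosition);
            }
        }
    }
}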
The IndexInfo for IColumn names is essentially a block index, whose granularity is set by the cassandra.yaml option column_index_size_in_kb: 64. While iterating over every IColumn of a row, the serialized lengths of the objects are accumulated; whenever the running total reaches column_index_size_in_kb, an IndexInfo object is emitted. The object has four fields: width is the serialized length of the group of objects; firstName is the name of the first object in the group; lastName is the name of the last object in the group; and offset is the position in Data.db where the group's serialization starts.
public final long width;           // serialized length of the block
public final ByteBuffer lastName;  // name of the last column in the block
public final ByteBuffer firstName; // name of the first column in the block
public final long offset;          // start of the block in Data.db
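The block-cutting rule is easiest to see in isolation. Below is a self-contained sketch of that rule using simplified stand-in classes (they mimic, but are not, Cassandra's); feeding it a row of 1000 columns of 200 bytes each with the default 64 KB threshold yields 4 IndexInfo blocks, since ~200 KB of data cut at 64 KB boundaries gives 3 full blocks plus a trailing partial one:
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

// Sketch of the block-cutting rule: accumulate serialized column sizes and
// emit one IndexInfo per column_index_size_in_kb worth of data. The Cell and
// IndexInfo classes here are simplified stand-ins.
public class BlockIndexSketch
{
    static class Cell
    {
        final ByteBuffer name;
        final long serializedSize;
        Cell(String name, long serializedSize)
        {
            this.name = ByteBuffer.wrap(name.getBytes());
            this.serializedSize = serializedSize;
        }
    }

    static class IndexInfo
    {
        final ByteBuffer firstName, lastName;
        final long offset, width;
        IndexInfo(ByteBuffer firstName, ByteBuffer lastName, long offset, long width)
        {
            this.firstName = firstName;
            this.lastName = lastName;
            this.offset = offset;
            this.width = width;
        }
    }

    static List<IndexInfo> build(List<Cell> row, long blockThreshold)
    {
        List<IndexInfo> index = new ArrayList<>();
        ByteBuffer firstName = null, lastName = null;
        long startPosition = 0, position = 0, blockSize = 0;

        for (Cell c : row)
        {
            if (firstName == null)
            {
                firstName = c.name;       // this cell opens a new block
                startPosition = position;
                blockSize = 0;
            }
            position += c.serializedSize;
            blockSize += c.serializedSize;
            lastName = c.name;
            if (blockSize >= blockThreshold)
            {
                // block is full: record [firstName, lastName] -> (offset, width)
                index.add(new IndexInfo(firstName, lastName, startPosition, position - startPosition));
                firstName = null;
            }
        }
        if (firstName != null) // close a trailing, partially filled block
            index.add(new IndexInfo(firstName, lastName, startPosition, position - startPosition));
        return index;
    }

    public static void main(String[] args)
    {
        List<Cell> row = new ArrayList<>();
        for (int i = 0; i < 1000; i++)
            row.add(new Cell(String.format("col%04d", i), 200)); // ~200 KB row
        List<IndexInfo> index = build(row, 64 * 1024);           // column_index_size_in_kb: 64
        System.out.println(index.size() + " IndexInfo blocks");  // prints: 4 IndexInfo blocks
    }
}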
From Cassandra 1.2 onward, serialization also covers the range deletion marker RangeTombstone, so building the IndexInfo is no longer a simple iteration over the whole ColumnFamily.
Improvement 1: RangeTombstones are part of the serialization
When building the IndexInfo, the RangeTombstones in the row's deletionInfo are merged with the column iteration: whenever the name of the next IColumn is greater than or equal to a pending RangeTombstone's start (tombstone.min), that RangeTombstone is serialized first. For example, given columns named b and d and a RangeTombstone covering [a, c], the tombstone is written before b, since b >= a.
/**
 * Serializes the index into in-memory structure with all required components
 * such as Bloom Filter, index block size, IndexInfo list
 *
 * @param cf Column family to create index for
 *
 * @return information about index - its Bloom Filter, block size and IndexInfo list
 */
public ColumnIndex build(ColumnFamily cf) throws IOException
{
    // Merge the sorted column stream with the sorted range-tombstone stream.
    Iterator<RangeTombstone> rangeIter = cf.deletionInfo().rangeIterator();
    RangeTombstone tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
    Comparator<ByteBuffer> comparator = cf.getComparator();

    for (IColumn c : cf)
    {
        // Emit every tombstone whose range starts at or before this column's name.
        while (tombstone != null && comparator.compare(c.name(), tombstone.min) >= 0)
        {
            add(tombstone);
            tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
        }
        add(c);
    }

    // Serialize any tombstones that sort after the last column.
    while (tombstone != null)
    {
        add(tombstone);
        tombstone = rangeIter.hasNext() ? rangeIter.next() : null;
    }

    return build();
}
public void add(OnDiskAtom column) throws IOException
{
    atomCount++;

    if (column instanceof IColumn)
        result.bloomFilter.add(column.name());

    if (firstColumn == null)
    {
        firstColumn = column;
        startPosition = endPosition;
        // TODO: have that use the firstColumn as min + make sure we
        // optimize that on read
        endPosition += tombstoneTracker.writeOpenedMarker(firstColumn, output, atomSerializer);
        blockSize = 0; // We don't count repeated tombstone marker in the block size, to avoid a situation
                       // where we wouldn't make any progress because a block is filled by said marker
    }

    long size = column.serializedSizeForSSTable();
    endPosition += size;
    blockSize += size;

    // if we hit the column index size that we have to index after, go ahead and index it.
    if (blockSize >= DatabaseDescriptor.getColumnIndexSize())
    {
        IndexHelper.IndexInfo cIndexInfo = new IndexHelper.IndexInfo(firstColumn.name(), column.name(), indexOffset + startPosition, endPosition - startPosition);
        result.columnsIndex.add(cIndexInfo);
        firstColumn = null;
        lastBlockClosing = column;
    }

    if (output != null)
        atomSerializer.serializeForSSTable(column, output);

    // TODO: Should deal with removing unneeded tombstones
    tombstoneTracker.update(column);
    lastColumn = column;
}
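On the read path, the columnsIndex built above is binary-searched to find the one block that can contain a requested column name; Cassandra does this in IndexHelper.indexFor, and the following is a simplified, self-contained sketch of the same idea:
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class IndexInfoLookup
{
    static class IndexInfo
    {
        final ByteBuffer firstName, lastName;
        final long offset, width;
        IndexInfo(ByteBuffer firstName, ByteBuffer lastName, long offset, long width)
        {
            this.firstName = firstName;
            this.lastName = lastName;
            this.offset = offset;
            this.width = width;
        }
    }

    // Lower-bound binary search: index of the first block whose lastName >= name,
    // i.e. the only block that can contain 'name'; -1 if name sorts after all blocks.
    static int indexFor(ByteBuffer name, List<IndexInfo> index, Comparator<ByteBuffer> cmp)
    {
        int low = 0, high = index.size() - 1;
        while (low <= high)
        {
            int mid = (low + high) >>> 1;
            if (cmp.compare(index.get(mid).lastName, name) < 0)
                low = mid + 1;
            else
                high = mid - 1;
        }
        return low < index.size() ? low : -1;
    }

    static ByteBuffer bb(String s) { return ByteBuffer.wrap(s.getBytes()); }

    public static void main(String[] args)
    {
        List<IndexInfo> index = new ArrayList<>();
        index.add(new IndexInfo(bb("a"), bb("f"), 0L, 65536L));
        index.add(new IndexInfo(bb("g"), bb("m"), 65536L, 65536L));
        index.add(new IndexInfo(bb("n"), bb("z"), 131072L, 40000L));

        int block = indexFor(bb("h"), index, ByteBuffer::compareTo);
        // 'h' <= 'm', so it can only live in block 1, starting at offset 65536
        System.out.println("column 'h' -> block " + block + ", Data.db offset " + index.get(block).offset);
    }
}
Because blocks are cut in comparator order, the lastName values are sorted, so the first block whose lastName is greater than or equal to the target name is the only candidate; only that block's width bytes starting at offset need to be read from Data.db.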
Improvement 2: an Index.db entry is no longer just a rowkey and the row's offset in Data.db. Each rowkey is now followed by a serialized RowIndexEntry: this always carries the row's position in Data.db, and for wide rows it additionally promotes the row-level deletion info and the IndexInfo list into Index.db (CASSANDRA-2319), so the right column block can be located before touching Data.db at all.
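A rough sketch of the shape of such an entry follows; the field names mirror the 1.2 RowIndexEntry/IndexedEntry classes, but the code itself is illustrative, not Cassandra's:
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.List;

// Rough sketch of a 1.2 index entry's shape; the real classes are
// org.apache.cassandra.db.RowIndexEntry and its IndexedEntry subclass,
// and the stand-in types below are illustrative only.
public class RowIndexEntrySketch
{
    static class DeletionTime
    {
        final long markedForDeleteAt; // timestamp of the row-level deletion
        final int localDeletionTime;
        DeletionTime(long markedForDeleteAt, int localDeletionTime)
        {
            this.markedForDeleteAt = markedForDeleteAt;
            this.localDeletionTime = localDeletionTime;
        }
    }

    static class IndexInfo
    {
        final ByteBuffer firstName, lastName;
        final long offset, width;
        IndexInfo(ByteBuffer firstName, ByteBuffer lastName, long offset, long width)
        {
            this.firstName = firstName;
            this.lastName = lastName;
            this.offset = offset;
            this.width = width;
        }
    }

    final long position;                // row offset in Data.db (always present)
    final DeletionTime deletionTime;    // promoted only for wide rows
    final List<IndexInfo> columnsIndex; // promoted only for wide rows

    RowIndexEntrySketch(long position) // narrow row: position only
    {
        this(position, null, Collections.<IndexInfo>emptyList());
    }

    RowIndexEntrySketch(long position, DeletionTime deletionTime, List<IndexInfo> columnsIndex)
    {
        this.position = position;
        this.deletionTime = deletionTime;
        this.columnsIndex = columnsIndex;
    }

    boolean isIndexed() { return !columnsIndex.isEmpty(); }

    public static void main(String[] args)
    {
        RowIndexEntrySketch narrow = new RowIndexEntrySketch(1024L);
        System.out.println("promoted index present? " + narrow.isIndexed()); // false
    }
}
The IndexWriter below is the 1.2 code that writes one such entry per rowkey into Index.db: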
/**
 * Encapsulates writing the index and filter for an SSTable. The state of this object is not valid until it has been closed.
 */
class IndexWriter implements Closeable
{
    private final SequentialWriter indexFile;
    public final SegmentedFile.Builder builder;
    public final IndexSummary summary;
    public final IFilter bf;
    private FileMark mark;

    IndexWriter(long keyCount)
    {
        indexFile = SequentialWriter.open(new File(descriptor.filenameFor(SSTable.COMPONENT_INDEX)),
                                          !metadata.populateIoCacheOnFlush());
        builder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode());
        summary = new IndexSummary(keyCount);
        bf = FilterFactory.getFilter(keyCount, metadata.getBloomFilterFpChance(), true);
    }

    public void append(DecoratedKey key, RowIndexEntry indexEntry)
    {
        bf.add(key.key);
        long indexPosition = indexFile.getFilePointer();
        try
        {
            ByteBufferUtil.writeWithShortLength(key.key, indexFile.stream);
            RowIndexEntry.serializer.serialize(indexEntry, indexFile.stream);
        }
        catch (IOException e)
        {
            throw new FSWriteError(e, indexFile.getPath());
        }

        if (logger.isTraceEnabled())
            logger.trace("wrote index entry: " + indexEntry + " at " + indexPosition);

        summary.maybeAddEntry(key, indexPosition);
        builder.addPotentialBoundary(indexPosition);
    }

    /**
     * Closes the index and bloomfilter, making the public state of this writer valid for consumption.
     */
    public void close()
    {
        if (components.contains(Component.FILTER))
        {
            String path = descriptor.filenameFor(SSTable.COMPONENT_FILTER);
            try
            {
                // bloom filter
                FileOutputStream fos = new FileOutputStream(path);
                DataOutputStream stream = new DataOutputStream(fos);
                FilterFactory.serialize(bf, stream, descriptor.version.filterType);
                stream.flush();
                fos.getFD().sync();
                stream.close();
            }
            catch (IOException e)
            {
                throw new FSWriteError(e, path);
            }
        }

        // index
        long position = indexFile.getFilePointer();
        indexFile.close(); // calls force
        FileUtils.truncate(indexFile.getPath(), position);

        // finalize in-memory index state
        summary.complete();
    }

    public void mark()
    {
        mark = indexFile.mark();
    }

    public void resetAndTruncate()
    {
        // we can't un-set the bloom filter addition, but extra keys in there are harmless.
        // we can't reset dbuilder either, but that is the last thing called in afterappend so
        // we assume that if that worked then we won't be trying to reset.
        indexFile.resetAndTruncate(mark);
    }

    @Override
    public String toString()
    {
        return "IndexWriter(" + descriptor + ")";
    }
}
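For context, the surrounding write path records the row's starting position in Data.db before serializing the row, and only then appends the index entry. The following self-contained sketch imitates that ordering with plain streams; SequentialWriter is replaced by a DataOutputStream and the serialized RowIndexEntry is reduced to the bare position, so this is a toy model of the flow, not Cassandra code:
import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Toy model of the append flow: capture the row's Data.db position first,
// serialize the row, then write the Index.db entry (short key length + key
// + entry). Byte-array streams stand in for SequentialWriter, and the
// "entry" is reduced to the bare position.
public class WriteFlowSketch
{
    public static void main(String[] args) throws IOException
    {
        DataOutputStream dataFile = new DataOutputStream(new ByteArrayOutputStream());
        DataOutputStream indexFile = new DataOutputStream(new ByteArrayOutputStream());

        String[][] rows = { { "key1", "row-1-bytes" }, { "key2", "row-2-bytes-longer" } };
        for (String[] row : rows)
        {
            long dataPosition = dataFile.size();  // where this row starts in "Data.db"
            dataFile.write(row[1].getBytes());    // serialize the row

            byte[] key = row[0].getBytes();
            indexFile.writeShort(key.length);     // like ByteBufferUtil.writeWithShortLength
            indexFile.write(key);
            indexFile.writeLong(dataPosition);    // the real code serializes a full RowIndexEntry here

            System.out.printf("indexed %s -> Data.db offset %d%n", row[0], dataPosition);
        }
    }
}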