Writing data:
```java
// Imports assume HBase 0.94-era APIs; wildcards are used for brevity.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KeyComparator;
import org.apache.hadoop.hbase.KeyValue.Type;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.*;
import org.apache.hadoop.hbase.io.hfile.Compression.Algorithm;
import org.apache.hadoop.hbase.regionserver.StoreFile.BloomType;
import org.apache.hadoop.hbase.util.*;

public class TestWrite {
    private static Configuration cfg = new Configuration();
    private static final int BLOCK_INDEX_SIZE = 60;
    private static final int BLOOM_BLOCK_INDEX_SIZE = 10;

    public TestWrite() {
        cfg.setInt("hfile.index.block.max.size", BLOCK_INDEX_SIZE);
        cfg.setInt("io.storefile.bloom.block.size", BLOOM_BLOCK_INDEX_SIZE);
        //cfg.setBoolean("hbase.regionserver.checksum.verify", true);
    }

    public static void main(String[] args) throws IOException {
        new TestWrite().test();
    }

    public void test() throws IOException {
        // Target path to write to
        Path path = new Path("/data0/hbase/test/myhfile");
        FileSystem fs = FileSystem.get(cfg);
        CacheConfig config = new CacheConfig(cfg);
        FSDataOutputStream fsdos = fs.create(path);
        //MyDataOutputStream mdos = new MyDataOutputStream(fsdos);
        //fsdos = new FSDataOutputStream(mdos);
        // Create the compression algorithm, the data block encoder and the comparator.
        // HFile's default comparator sorts lexicographically; a custom comparator
        // can be supplied instead, but it must extend KeyComparator.
        Algorithm algorithm = Algorithm.GZ;
        HFileDataBlockEncoder encoder = new HFileDataBlockEncoderImpl(DataBlockEncoding.DIFF);
        KeyComparator comparator = new KeyComparator();
        ChecksumType check = ChecksumType.CRC32;
        // Create the HFile writer, specifying the data block size and
        // how many bytes each checksum covers
        int blockSize = 100;
        int checkPerBytes = 16384;
        HFileWriterV2 v2 = new HFileWriterV2(cfg, config, fs, path, fsdos, blockSize, algorithm,
                encoder, comparator, check, checkPerBytes, true);
        /**
         * Since the default comparator is lexicographic, keys must also be
         * appended in lexicographic order. If the input is not already sorted,
         * a red-black tree (TreeSet) can be used to keep the keys ordered:
        String keyPrefix = "key";
        TreeSet<String> set = new TreeSet<String>();
        int len = 100;
        for (int i = 1; i <= len; i++) {
            set.add("" + i);
        }
        for (String key : set) {
            String generatorKey = keyPrefix + key;
            v2.append(generator(generatorKey, "c", "", System.currentTimeMillis(), VALUES));
        }
        */
        // Create two bloom filters, each sized for at most 5 keys
        int maxKey = 5;
        BloomFilterWriter bw = BloomFilterFactory.createGeneralBloomAtWrite(cfg, config, BloomType.ROW, maxKey, v2);
        BloomFilterWriter bw2 = BloomFilterFactory.createDeleteBloomAtWrite(cfg, config, maxKey, v2);
        // Generate KeyValues, append them to the HFile and add them to the bloom filters
        KeyValue kv = generator("key111111111111111111111111", "value", "f", System.currentTimeMillis(), new byte[]{'2'});
        addToHFileWriterAndBloomFile(kv, v2, bw, bw2);
        kv = generator("key222222222222222222222222", "value", "f", System.currentTimeMillis(), new byte[]{'2'});
        addToHFileWriterAndBloomFile(kv, v2, bw, bw2);
        kv = generator("key333333333333333333333333", "value", "f", System.currentTimeMillis(), new byte[]{'2'});
        addToHFileWriterAndBloomFile(kv, v2, bw, bw2);
        // Write the meta block, the general bloom filter block, the delete family
        // bloom filter block, and a custom key-value in the file info block.
        // HFile.Writer checks whether a bloom filter contains any data before
        // accepting it, so the keys must be added to the bloom filters first
        // and the filters handed to the writer afterwards.
        v2.addGeneralBloomFilter(bw);
        v2.addDeleteFamilyBloomFilter(bw2);
        v2.appendMetaBlock("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", new MyWritable());
        v2.appendFileInfo(Bytes.toBytes("mykey"), Bytes.toBytes("myvalue"));
        v2.close();
    }

    /**
     * Appends a KeyValue to the HFile and adds its key to both bloom filters.
     */
    public void addToHFileWriterAndBloomFile(KeyValue kv, HFileWriterV2 v2, BloomFilterWriter bw, BloomFilterWriter bw2)
            throws IOException {
        v2.append(kv);
        byte[] buf = bw.createBloomKey(kv.getBuffer(),
                kv.getRowOffset(), kv.getRowLength(), kv.getBuffer(),
                kv.getQualifierOffset(), kv.getQualifierLength());
        bw.add(buf, 0, buf.length);
        bw2.add(buf, 0, buf.length);
    }

    /**
     * Builds a KeyValue of type Put.
     */
    public KeyValue generator(String key, String column, String qualifier, long timestamp, byte[] value) {
        byte[] keyBytes = Bytes.toBytes(key);
        byte[] familyBytes = Bytes.toBytes(column);
        byte[] qualifierBytes = Bytes.toBytes(qualifier);
        Type type = Type.Put;
        byte[] valueBytes = value;
        return new KeyValue(keyBytes, 0, keyBytes.length, familyBytes, 0, familyBytes.length,
                qualifierBytes, 0, qualifierBytes.length, timestamp, type, valueBytes, 0, valueBytes.length);
    }
}
```
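
To sanity-check the file just written, HBase ships a pretty printer that dumps an HFile's KeyValues and metadata. A minimal sketch, assuming an HBase 0.94-era classpath where `HFile.main` delegates to `HFilePrettyPrinter` and accepts the `-f` (file), `-p` (print KeyValues) and `-m` (print meta) flags:

```java
import org.apache.hadoop.hbase.io.hfile.HFile;

public class DumpMyHFile {
    public static void main(String[] args) throws Exception {
        // Dump all KeyValues (-p) and the metadata (-m) of the HFile
        // written by TestWrite above.
        HFile.main(new String[] { "-p", "-m", "-f", "/data0/hbase/test/myhfile" });
    }
}
```
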
Memory dump at the moment the data is written to disk (figure omitted).
Reading data:
```java
// Imports assume HBase 0.94-era APIs; wildcards are used for brevity.
// HFileBlock$FSReaderV2 has no public constructor, hence the reflection below.
import java.io.DataInput;
import java.io.IOException;
import java.lang.reflect.*;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.KeyValue.KeyComparator;
import org.apache.hadoop.hbase.fs.HFileSystem;
import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding;
import org.apache.hadoop.hbase.io.hfile.*;
import org.apache.hadoop.hbase.io.hfile.HFile.Reader;
import org.apache.hadoop.hbase.io.hfile.HFileBlock.FSReader;
import org.apache.hadoop.hbase.io.hfile.HFileBlockIndex.BlockIndexReader;
import org.apache.hadoop.hbase.util.*;
import org.apache.hadoop.io.RawComparator;

public class TestReader {
    public static String FILE_PATH = "/data0/hbase/test/myhfile";
    public Configuration cfg = new Configuration();
    private FSReader fsBlockReader;
    /**
     * Fixed overhead of one secondary index entry:
     * block offset (long) + on-disk size (int)
     */
    private static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT + Bytes.SIZEOF_LONG;

    public static void main(String[] args) throws Exception {
        TestReader t = new TestReader();
        t.readBloom();
        // t.readScan();
        // t.readIndex();
    }

    /**
     * Parses the bloom filters.
     */
    public void readBloom() throws IOException {
        // Create the path, the local file system and the cache config
        Path path = new Path(FILE_PATH);
        FileSystem fs = FileSystem.getLocal(cfg);
        CacheConfig config = new CacheConfig(cfg);
        // Let HFile create the Reader implementation
        Reader reader = HFile.createReader(fs, path, config);
        // Load the general bloom filter
        DataInput bloomMeta = reader.getGeneralBloomFilterMetadata();
        BloomFilter bloomFilter = null;
        if (bloomMeta != null) {
            bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
            System.out.println(bloomFilter);
        }
        // Load the delete family bloom filter
        bloomMeta = reader.getDeleteBloomFilterMetadata();
        bloomFilter = null;
        if (bloomMeta != null) {
            bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
            System.out.println(bloomFilter);
        }
        // Reading meta blocks is implemented in HFileReaderV2#getMetaBlock()
    }
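
    /**
     * Hedged example (not part of the original post): probing a loaded bloom
     * filter for a row key. Assumes the 0.94-era BloomFilter interface, where
     * contains(byte[], int, int, ByteBuffer) tests membership; a compound
     * bloom filter ignores the ByteBuffer argument and reads its own blocks,
     * so null can be passed for it.
     */
    public boolean mightContainRow(BloomFilter bloomFilter, String row) {
        byte[] rowBytes = Bytes.toBytes(row);
        // For a BloomType.ROW filter the bloom key is just the row; a ROWCOL
        // filter would need the composite key built by createBloomKey()
        return bloomFilter.contains(rowBytes, 0, rowBytes.length, null);
    }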

    /**
     * Reads the data blocks with a scanner.
     */
    @SuppressWarnings("unchecked")
    public void readScan() throws IOException, SecurityException,
            NoSuchMethodException, IllegalArgumentException,
            IllegalAccessException, InvocationTargetException {
        // Create the path, the local file system and two input streams
        Path path = new Path(FILE_PATH);
        FileSystem fs = FileSystem.getLocal(cfg);
        CacheConfig config = new CacheConfig(cfg);
        FSDataInputStream fsdis = fs.open(path);
        FSDataInputStream fsdisNoFsChecksum = fsdis;
        HFileSystem hfs = new HFileSystem(fs);
        long size = fs.getFileStatus(path).getLen();
        // The input stream plus the file length are enough to read the trailer block
        FixedFileTrailer trailer = FixedFileTrailer.readFromStream(fsdis, size);
        // From the trailer and the related information, build the HFile.Reader implementation
        HFileReaderV2 v2 = new HFileReaderV2(path, trailer, fsdis,
                fsdisNoFsChecksum, size, true, config, DataBlockEncoding.NONE,
                hfs);
        System.out.println(v2);
        // Read the contents of the FileInfo block
        Method method = v2.getClass().getMethod("loadFileInfo", new Class[] {});
        Map<byte[], byte[]> fileInfo = (Map<byte[], byte[]>) method.invoke(v2,
                new Object[] {});
        Iterator<Entry<byte[], byte[]>> iter = fileInfo.entrySet().iterator();
        while (iter.hasNext()) {
            Entry<byte[], byte[]> entry = iter.next();
            // Values are typed per key (int, long, raw bytes, ...), so print
            // them in a binary-safe form instead of forcing one numeric type
            System.out.println(Bytes.toString(entry.getKey()) + " = "
                    + Bytes.toStringBinary(entry.getValue()));
        }
        // Create a scanner from the Reader implementation; it reads the data
        // blocks and iterates over all of their KeyValues
        HFileScanner scanner = v2.getScanner(false, false);
        scanner.seekTo();
        System.out.println(scanner.getKeyValue());
        KeyValue kv = scanner.getKeyValue();
        while (scanner.next()) {
            kv = scanner.getKeyValue();
            System.out.println(kv);
        }
        v2.close();
    }
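
    /**
     * Hedged example (not part of the original post): seeking to a specific
     * row before scanning. Assumes the 0.94-era HFileScanner#seekTo(byte[])
     * contract: -1 means the key sorts before the first key of the file,
     * 0 means an exact match, 1 means the scanner is positioned at the
     * greatest key smaller than the requested one.
     */
    public void seekToRow(HFileScanner scanner, String row) throws IOException {
        // Build the smallest possible key on the row and seek to it
        byte[] key = KeyValue.createFirstOnRow(Bytes.toBytes(row)).getKey();
        if (scanner.seekTo(key) != -1) {
            System.out.println(scanner.getKeyValue());
        }
    }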

    /**
     * Parses the data index of the HFile.
     */
    @SuppressWarnings({ "unused", "unchecked" })
    public void readIndex() throws Exception {
        // Create the path, the local file system and two input streams;
        // the stream plus the file length are enough to read the trailer block
        Path path = new Path(FILE_PATH);
        FileSystem fs = FileSystem.getLocal(cfg);
        CacheConfig config = new CacheConfig(cfg);
        FSDataInputStream fsdis = fs.open(path);
        FSDataInputStream fsdisNoFsChecksum = fsdis;
        HFileSystem hfs = new HFileSystem(fs);
        long size = fs.getFileStatus(path).getLen();
        FixedFileTrailer trailer = FixedFileTrailer.readFromStream(fsdis, size);
        // The classes below are also created in the Reader implementation's
        // constructor. FSReaderV2, the concrete block reader, exposes no public
        // constructor, so it has to be instantiated through reflection.
        Compression.Algorithm compressAlgo = trailer.getCompressionCodec();
        Class<?> clazz = Class
                .forName("org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderV2");
        java.lang.reflect.Constructor<FSReader> constructor = (Constructor<FSReader>) clazz
                .getConstructor(new Class[] { FSDataInputStream.class,
                        FSDataInputStream.class, Compression.Algorithm.class,
                        long.class, int.class, HFileSystem.class, Path.class });
        constructor.setAccessible(true);
        fsBlockReader = constructor.newInstance(fsdis, fsdis, compressAlgo,
                size, 0, hfs, path);
        // Create the comparator, whose class name is recorded in the trailer block
        RawComparator<byte[]> comparator = FixedFileTrailer
                .createComparator(KeyComparator.class.getName());
        // Create the root index reader for the data blocks
        BlockIndexReader dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
                comparator, trailer.getNumDataIndexLevels());
        // Create the root index reader for the meta blocks
        BlockIndexReader metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
                Bytes.BYTES_RAWCOMPARATOR, 1);
        // Create an HFileBlock iterator over the load-on-open section
        HFileBlock.BlockIterator blockIter = fsBlockReader.blockRange(
                trailer.getLoadOnOpenDataOffset(),
                size - trailer.getTrailerSize());
        // Read the root index of the data blocks
        dataBlockIndexReader.readMultiLevelIndexRoot(
                blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
                trailer.getDataIndexCount());
        // Read the root index of the meta blocks
        metaBlockIndexReader.readRootIndex(
                blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
                trailer.getMetaIndexCount());
        // Read the FileInfo block. HFile's own FileInfo class is not public,
        // so MyFileInfo (below) reimplements the same layout.
        long fileinfoOffset = trailer.getFileInfoOffset();
        HFileBlock fileinfoBlock = fsBlockReader.readBlockData(fileinfoOffset,
                -1, -1, false);
        MyFileInfo fileinfo = new MyFileInfo();
        fileinfo.readFields(fileinfoBlock.getByteStream());
        int avgKeyLength = Bytes.toInt(fileinfo.get(MyFileInfo.AVG_KEY_LEN));
        int avgValueLength = Bytes
                .toInt(fileinfo.get(MyFileInfo.AVG_VALUE_LEN));
        long entryCount = trailer.getEntryCount();
        System.out.println("avg key length=" + avgKeyLength);
        System.out.println("avg value length=" + avgValueLength);
        System.out.println("entry count=" + entryCount);
        int numDataIndexLevels = trailer.getNumDataIndexLevels();
        if (numDataIndexLevels > 1) {
            // multi-level index
            iteratorRootIndex(dataBlockIndexReader);
        } else {
            // single-level (root-only) index
            iteratorSingleIndex(dataBlockIndexReader);
        }
        fsdis.close();
        fsdisNoFsChecksum.close();
    }
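
    /*
     * Layout of a non-root (intermediate or leaf) index block body, as the
     * iterators below assume (HFile v2; hedged summary, not from the original
     * post):
     *
     *   int            entry count N
     *   int x (N + 1)  secondary index: the relative offset of every entry,
     *                  plus one final offset marking the end of the entries
     *   N entries      each entry = block offset (long) + on-disk size (int)
     *                  + key bytes
     *
     * The entry data therefore begins (N + 2) ints into the block body, which
     * is exactly what entriesOffset computes.
     */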

    /**
     * Parses a single-level (root-only) index.
     */
    public void iteratorSingleIndex(BlockIndexReader dataBlockIndex) {
        for (int i = 0; i < dataBlockIndex.getRootBlockCount(); i++) {
            byte[] keyCell = dataBlockIndex.getRootBlockKey(i);
            int blockDataSize = dataBlockIndex.getRootBlockDataSize(i);
            String rowKey = parseKeyCellRowkey(keyCell);
            System.out.println("rowkey=" + rowKey + "\tdata size="
                    + blockDataSize);
        }
    }

    /**
     * Parses a multi-level index, starting from the root index.
     */
    public void iteratorRootIndex(BlockIndexReader dataBlockIndex)
            throws IOException {
        for (int i = 0; i < dataBlockIndex.getRootBlockCount(); i++) {
            long offset = dataBlockIndex.getRootBlockOffset(i);
            int onDiskSize = dataBlockIndex.getRootBlockDataSize(i);
            iteratorNonRootIndex(offset, onDiskSize);
        }
    }

    /**
     * Recursively parses each intermediate index block.
     */
    public void iteratorNonRootIndex(long offset, int onDiskSize)
            throws IOException {
        HFileBlock block = fsBlockReader.readBlockData(offset, onDiskSize, -1,
                false);
        if (block.getBlockType().equals(BlockType.LEAF_INDEX)) {
            parseLeafIndex(block);
            return;
        }
        // Compute the position of each key in this intermediate index block
        ByteBuffer buffer = block.getBufferReadOnly();
        buffer = ByteBuffer.wrap(buffer.array(),
                buffer.arrayOffset() + block.headerSize(),
                buffer.limit() - block.headerSize()).slice();
        int indexCount = buffer.getInt();
        // Offset where the entry data begins: the entry count (one int)
        // plus (indexCount + 1) secondary-index offsets (one int each)
        int entriesOffset = Bytes.SIZEOF_INT * (indexCount + 2);
        for (int i = 0; i < indexCount; i++) {
            // Relative offset of entry i, taken from the secondary index;
            // for the first key this is the second int of the block
            // (the first int is the entry count)
            int indexKeyOffset = buffer.getInt(Bytes.SIZEOF_INT * (i + 1));
            long blockOffsetIndex = buffer.getLong(indexKeyOffset
                    + entriesOffset);
            int blockSizeIndex = buffer.getInt(indexKeyOffset + entriesOffset
                    + Bytes.SIZEOF_LONG);
            iteratorNonRootIndex(blockOffsetIndex, blockSizeIndex);
        }
    }

    /**
     * Parses a leaf index block.
     */
    public void parseLeafIndex(HFileBlock block) {
        // Compute the position of each key in this leaf index block
        ByteBuffer buffer = block.getBufferReadOnly();
        buffer = ByteBuffer.wrap(buffer.array(),
                buffer.arrayOffset() + block.headerSize(),
                buffer.limit() - block.headerSize()).slice();
        int indexCount = buffer.getInt();
        // Offset where the entry data begins: the entry count (one int)
        // plus (indexCount + 1) secondary-index offsets (one int each)
        int entriesOffset = Bytes.SIZEOF_INT * (indexCount + 2);
        for (int i = 0; i < indexCount; i++) {
            // Relative offset of entry i, taken from the secondary index
            int indexKeyOffset = buffer.getInt(Bytes.SIZEOF_INT * (i + 1));
            // Real key position = start of the entry data + relative entry
            // offset + per-entry overhead (block offset long + on-disk size int)
            int keyOffset = entriesOffset + indexKeyOffset
                    + SECONDARY_INDEX_ENTRY_OVERHEAD;
            // long blockOffsetIndex =
            //         buffer.getLong(indexKeyOffset + entriesOffset);
            int blockSizeIndex = buffer.getInt(indexKeyOffset + entriesOffset
                    + Bytes.SIZEOF_LONG);
            // Key length = distance to the next entry's relative offset,
            // minus the per-entry overhead
            int length = buffer.getInt(Bytes.SIZEOF_INT * (i + 2))
                    - indexKeyOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;
            // A key cell contains the row key length (2 bytes), the row key,
            // the family length (1 byte), the family, the qualifier, the
            // timestamp (8 bytes) and the key type (1 byte);
            // only the row key is needed here
            byte[] keyCell = new byte[length];
            System.arraycopy(buffer.array(), buffer.arrayOffset() + keyOffset,
                    keyCell, 0, length);
            String rowKey = parseKeyCellRowkey(keyCell);
            System.out.println("rowkey=" + rowKey + "\t blockSizeIndex="
                    + blockSizeIndex);
        }
    }

    /**
     * Extracts the row key from a key cell.
     */
    public static String parseKeyCellRowkey(byte[] cell) {
        if (cell == null || cell.length < 3) {
            throw new IllegalArgumentException("cell length is illegal");
        }
        // The first two bytes hold the row key length as a big-endian short
        int keySize = ((cell[0] & 0xFF) << 8) | (cell[1] & 0xFF);
        byte[] key = new byte[keySize];
        System.arraycopy(cell, 2, key, 0, key.length);
        return Bytes.toString(key);
    }
}
```
Utility classes:
```java
import java.io.IOException;

import org.apache.hadoop.hbase.io.HbaseMapWritable;
import org.apache.hadoop.hbase.util.Bytes;

/**
 * Reimplements HFile's FileInfo, which is not a public class.
 */
public class MyFileInfo extends HbaseMapWritable<byte[], byte[]> {
    /**
     * Keys reserved by HFile start with "hfile."
     */
    public static final String RESERVED_PREFIX = "hfile.";
    /**
     * Binary form of the reserved prefix
     */
    public static final byte[] RESERVED_PREFIX_BYTES = Bytes
            .toBytes(RESERVED_PREFIX);
    /**
     * Last key
     */
    public static final byte[] LASTKEY = Bytes.toBytes(RESERVED_PREFIX + "LASTKEY");
    /**
     * Average key length
     */
    public static final byte[] AVG_KEY_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_KEY_LEN");
    /**
     * Average value length
     */
    public static final byte[] AVG_VALUE_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_VALUE_LEN");
    /**
     * Comparator
     */
    public static final byte[] COMPARATOR = Bytes.toBytes(RESERVED_PREFIX + "COMPARATOR");

    /**
     * Adds a key/value pair to the file info, optionally checking the key prefix.
     */
    public MyFileInfo append(final byte[] k, final byte[] v, final boolean checkPrefix) throws IOException {
        if (k == null || v == null) {
            throw new NullPointerException("Neither key nor value may be null");
        }
        if (checkPrefix && isReservedFileInfoKey(k)) {
            throw new IOException("Keys with a " + MyFileInfo.RESERVED_PREFIX
                    + " prefix are reserved");
        }
        put(k, v);
        return this;
    }

    /**
     * Checks whether the given key starts with the reserved prefix.
     */
    public static boolean isReservedFileInfoKey(byte[] key) {
        return Bytes.startsWith(key, MyFileInfo.RESERVED_PREFIX_BYTES);
    }
}
```
```java
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Custom Writable used as the payload of the meta block.
 */
public class MyWritable implements Writable {
    @Override
    public void readFields(DataInput input) throws IOException {
        // Consume the int written by write(); the value itself is discarded
        input.readInt();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // writeInt, not write(int): write(int) would emit a single byte,
        // while readFields expects a 4-byte int
        out.writeInt(123456);
    }
}
```
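
The meta block written by appendMetaBlock in TestWrite can also be read back through the public Reader API, without any reflection. A minimal round-trip sketch, assuming the 0.94-era HFile.Reader#getMetaBlock(String, boolean) signature, which returns the block payload as a ByteBuffer:

```java
import java.nio.ByteBuffer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;

public class ReadMetaBlock {
    public static void main(String[] args) throws Exception {
        Configuration cfg = new Configuration();
        FileSystem fs = FileSystem.get(cfg);
        Path path = new Path("/data0/hbase/test/myhfile");
        HFile.Reader reader = HFile.createReader(fs, path, new CacheConfig(cfg));
        reader.loadFileInfo();
        // Fetch the meta block written by appendMetaBlock in TestWrite and
        // decode the int that MyWritable#write serialized into it
        ByteBuffer meta = reader.getMetaBlock("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", false);
        System.out.println(meta.getInt()); // expected: 123456
        reader.close();
    }
}
```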