HBase-HFile的读写操作

最新推荐文章于 2022-01-28 09:44:27 发布

peixun123

最新推荐文章于 2022-01-28 09:44:27 发布

阅读量1k

点赞数

分类专栏： hadoop 文章标签：大数据 java shell

本文链接：https://blog.csdn.net/peixun123/article/details/84596109

版权

hadoop 专栏收录该内容

21 篇文章 0 订阅

订阅专栏

写入数据:

public class TestWrit {
	private static Configuration cfg = new Configuration();
	private static final int BLOCK_INDEX_SIZE = 60;
	private static final int BLOOM_BLOCK_INDEX_SIZE = 10;
	public TestWrit() {
		cfg.setInt("hfile.index.block.max.size", BLOCK_INDEX_SIZE);
		cfg.setInt("io.storefile.bloom.block.size", BLOOM_BLOCK_INDEX_SIZE);
		//cfg.setBoolean("hbase.regionserver.checksum.verify", true);
	}
	
	public static void main(String[] args) throws IOException {
	}
	
	public void test() throws IOException {
		//指定写入的路径
		Path path = new Path("/data0/hbase/test/myhfile");		
		FileSystem fs = FileSystem.get(cfg);
		CacheConfig config = new CacheConfig(cfg);
		FSDataOutputStream fsdos = fs.create(path);
		//MyDataOutputStream mdos = new MyDataOutputStream(fsdos);
		//fsdos = new FSDataOutputStream(mdos);
		
		//创建压缩算法，文件块编码，比较器
		//HFile默认的比较器是字典排序的，也可以生成一个自定义的比较器，但必须继承KeyComparator
		Algorithm algorithm = Algorithm.GZ;
		HFileDataBlockEncoder encoder = new HFileDataBlockEncoderImpl(DataBlockEncoding.DIFF);
		KeyComparator comparator = new KeyComparator();
		ChecksumType check = ChecksumType.CRC32;
	
		//创建HFile写实现类，指定写入的数据块大小，多少字节生成一个checksum
		int blockSize = 100;
		int checkPerBytes = 16384;
		HFileWriterV2 v2 = new HFileWriterV2(cfg, config, fs, path, fsdos, blockSize, algorithm, 
				encoder, comparator, check, checkPerBytes, true);
	
	/**
	 * HFile默认的比较器是字典排序的，所以插入的key也必须是字典排序，如果不想按照字典排序，
	 * 这里使用红黑树保证key的有序性
		String keyPrefix = new String("key");
		TreeSet<String> set = new TreeSet<String>();
		int len = 100;
		for(int i=1;i<=len;i++) {
			set.add(""+i);
		}
		for(String key:set) {
			String generatorKey = keyPrefix+key;
			v2.append( generator(generatorKey,"c","",System.currentTimeMillis(),VALUES) );
		}
	*/
		
		//创建两个布隆过滤器，指定最大的key数为5
		int maxKey = 5;
		BloomFilterWriter bw = BloomFilterFactory.createGeneralBloomAtWrite(cfg, config, BloomType.ROW, maxKey, v2);
		BloomFilterWriter bw2 = BloomFilterFactory.createDeleteBloomAtWrite(cfg, config, maxKey, v2);
	
		//生成KeyValue，插入到HFile中，并保存到布隆过滤器中
		KeyValue kv = generator("key111111111111111111111111","value","f",System.currentTimeMillis(),new byte[]{'2'});
		addToHFileWirterAndBloomFile(kv,v2,bw,bw2);
		
		kv = generator("key222222222222222222222222","value","f",System.currentTimeMillis(),new byte[]{'2'});
		addToHFileWirterAndBloomFile(kv,v2,bw,bw2);
		
		kv = generator("key333333333333333333333333","value","f",System.currentTimeMillis(),new byte[]{'2'});
		addToHFileWirterAndBloomFile(kv,v2,bw,bw2);
		
		//生成meta块，布隆过滤器块，删除的布隆过滤器块
		//自定义文件信息块的key-value
		//布隆过滤器加入到HFile.Writer时会判断里面是否有数据，所以要先将key插入到布隆过滤器中，再加入到
		//Writerv2中
		v2.addGeneralBloomFilter(bw);
		v2.addDeleteFamilyBloomFilter(bw2);
		v2.appendMetaBlock("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", new MyWritable());
		v2.appendFileInfo(Bytes.toBytes("mykey"), Bytes.toBytes("myvalue"));
		v2.close();
	}
	
	/**
	 * 插入一个KeyValue到HFile中，同时将这个key保存到布隆过滤器中
	 */
	public void addToHFileWirterAndBloomFile(KeyValue kv, HFileWriterV2 v2, BloomFilterWriter bw, BloomFilterWriter bw2) 
	throws IOException {
		v2.append( kv );
		byte[] buf = bw.createBloomKey(kv.getBuffer(),
                kv.getRowOffset(), kv.getRowLength(), kv.getBuffer(),
                kv.getQualifierOffset(), kv.getQualifierLength());
		bw.add(buf, 0, buf.length);
		bw2.add(buf, 0, buf.length);
	
	}
	
	/**
	 * 生成KeyValue
	 */
	public KeyValue generator(String key,String column,String qualifier,long timestamp,byte[] value) {
		byte[] keyBytes = Bytes.toBytes(key);
		byte[] familyBytes = Bytes.toBytes(column);
		byte[] qualifierBytes = Bytes.toBytes(qualifier);
		Type type = Type.Put;
		byte[] valueBytes = value;
		KeyValue kv = new KeyValue(keyBytes, 0, keyBytes.length, familyBytes, 0, familyBytes.length, 
				qualifierBytes, 0, qualifierBytes.length, timestamp, type, valueBytes, 0, valueBytes.length);		
		return kv;
	}
}

写入到磁盘时的内存dump:

读取操作：

public class TestReader {

	public static String FILE_PATH = "/data0/hbase/test/myhfile";
	public Configuration cfg = new Configuration();
	private FSReader fsBlockReader;
	/**
	 * 二级索引长度
	 */
	private static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT + Bytes.SIZEOF_LONG;

	public static void main(String[] args) throws Exception {
		TestReader t = new TestReader();
		t.readBloom();
	}

	/**
	 * 解析布隆过滤器
	 */
	public void readBloom() throws IOException {
		// 创建读取路径，本地文件系统，两个读取流
		Path path = new Path(FILE_PATH);
		FileSystem fs = FileSystem.getLocal(cfg);
		CacheConfig config = new CacheConfig(cfg);

		// 由HFile创建出Reader实现类
		Reader reader = HFile.createReader(fs, path, config);

		// 创建通用布隆过滤器
		DataInput bloomMeta = reader.getGeneralBloomFilterMetadata();
		BloomFilter bloomFilter = null;
		if (bloomMeta != null) {
			bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
			System.out.println(bloomFilter);
		}

		//创建删除的布隆过滤器
		bloomMeta = reader.getDeleteBloomFilterMetadata();
		bloomFilter = null;
		if (bloomMeta != null) {
			bloomFilter = BloomFilterFactory.createFromMeta(bloomMeta, reader);
			System.out.println(bloomFilter);
		}
		
		//meta的读取实现在  HFileReaderV2#getMetaBlock()中
	}

	/**
	 * 使用Scanner读取数据块内容
	 */
	@SuppressWarnings("unchecked")
	public void readScan() throws IOException, SecurityException,
			NoSuchMethodException, IllegalArgumentException,
			IllegalAccessException, InvocationTargetException {
		// 创建读取路径，本地文件系统，两个读取流
		Path path = new Path(FILE_PATH);
		FileSystem fs = FileSystem.getLocal(cfg);
		CacheConfig config = new CacheConfig(cfg);
		FSDataInputStream fsdis = fs.open(path);
		FSDataInputStream fsdisNoFsChecksum = fsdis;
		HFileSystem hfs = new HFileSystem(fs);
		long size = fs.getFileStatus(path).getLen();

		// 由读FS读取流，文件长度，就可以读取到尾文件块
		FixedFileTrailer trailer = FixedFileTrailer.readFromStream(fsdis, size);

		// 根据尾文件块，和其他相关信息，创建HFile.Reader实现
		HFileReaderV2 v2 = new HFileReaderV2(path, trailer, fsdis,
				fsdisNoFsChecksum, size, true, config, DataBlockEncoding.NONE,
				hfs);
		System.out.println(v2);

		// 读取FileInfo中的内容
		Method method = v2.getClass().getMethod("loadFileInfo", new Class[] {});
		Map<byte[], byte[]> fileInfo = (Map<byte[], byte[]>) method.invoke(v2,
				new Object[] {});
		Iterator<Entry<byte[], byte[]>> iter = fileInfo.entrySet().iterator();
		while (iter.hasNext()) {
			Entry<byte[], byte[]> entry = iter.next();
			System.out.println(Bytes.toString(entry.getKey()) + " = "
					+ Bytes.toShort(entry.getValue()));
		}

		// 由Reader实现创建扫描器Scanner，负责读取数据块
		// 并遍历所有的数据块中的KeyValue
		HFileScanner scanner = v2.getScanner(false, false);
		scanner.seekTo();
		System.out.println(scanner.getKeyValue());

		KeyValue kv = scanner.getKeyValue();
		while (scanner.next()) {
			kv = scanner.getKeyValue();
			System.out.println(kv);
		}
		v2.close();

	}

	/**
	 * 解析HFile中的数据索引
	 */
	@SuppressWarnings({ "unused", "unchecked" })
	public void readIndex() throws Exception {
		// 创建读取路径，本地文件系统，两个读取流
		// 由读FS读取流，文件长度，就可以读取到尾文件块
		Path path = new Path(FILE_PATH);
		FileSystem fs = FileSystem.getLocal(cfg);
		CacheConfig config = new CacheConfig(cfg);
		FSDataInputStream fsdis = fs.open(path);
		FSDataInputStream fsdisNoFsChecksum = fsdis;
		HFileSystem hfs = new HFileSystem(fs);
		long size = fs.getFileStatus(path).getLen();
		FixedFileTrailer trailer = FixedFileTrailer.readFromStream(fsdis, size);

		// 下面创建的一些类，在Reader实现类的构造函数中也可以找到，创建具体文件读取实现FSReader
		// 由于这个类没有提供对外的创建方式，只能通过反射构造 FSReader
		Compression.Algorithm compressAlgo = trailer.getCompressionCodec();
		Class<?> clazz = Class
				.forName("org.apache.hadoop.hbase.io.hfile.HFileBlock$FSReaderV2");
		java.lang.reflect.Constructor<FSReader> constructor = (Constructor<FSReader>) clazz
				.getConstructor(new Class[] { FSDataInputStream.class,
						FSDataInputStream.class, Compression.Algorithm.class,
						long.class, int.class, HFileSystem.class, Path.class });
		constructor.setAccessible(true);
		fsBlockReader = constructor.newInstance(fsdis, fsdis, compressAlgo,
				size, 0, hfs, path);

		// 创建比较器，比较器是定义在尾文件块中
		RawComparator<byte[]> comparator = FixedFileTrailer
				.createComparator(KeyComparator.class.getName());

		// 创建读取数据块的根索引
		BlockIndexReader dataBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
				comparator, trailer.getNumDataIndexLevels());

		// 创建读取元数据快的根索引
		BlockIndexReader metaBlockIndexReader = new HFileBlockIndex.BlockIndexReader(
				Bytes.BYTES_RAWCOMPARATOR, 1);

		// 创建 HFileBlock 迭代器
		HFileBlock.BlockIterator blockIter = fsBlockReader.blockRange(
				trailer.getLoadOnOpenDataOffset(),
				size - trailer.getTrailerSize());

		// 读取数据文件根索引
		dataBlockIndexReader.readMultiLevelIndexRoot(
				blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
				trailer.getDataIndexCount());

		// 读取元数据根索引
		metaBlockIndexReader.readRootIndex(
				blockIter.nextBlockWithBlockType(BlockType.ROOT_INDEX),
				trailer.getMetaIndexCount());

		// 读取FileInfo块中的信息
		// 由于FileInfo块不是public的，所以定义了一个MyFileInfo，内容跟FileInfo一样
		long fileinfoOffset = trailer.getFileInfoOffset();
		HFileBlock fileinfoBlock = fsBlockReader.readBlockData(fileinfoOffset,
				-1, -1, false);
		MyFileInfo fileinfo = new MyFileInfo();
		fileinfo.readFields(fileinfoBlock.getByteStream());
		int avgKeyLength = Bytes.toInt(fileinfo.get(MyFileInfo.AVG_KEY_LEN));
		int avgValueLength = Bytes
				.toInt(fileinfo.get(MyFileInfo.AVG_VALUE_LEN));
		long entryCount = trailer.getEntryCount();
		System.out.println("avg key length=" + avgKeyLength);
		System.out.println("avg value length=" + avgValueLength);
		System.out.println("entry count=" + entryCount);

		int numDataIndexLevels = trailer.getNumDataIndexLevels();
		if (numDataIndexLevels > 1) {
			// 大于一层
			iteratorRootIndex(dataBlockIndexReader);
		} else {
			// 单根索引
			iteratorSingleIndex(dataBlockIndexReader);
		}

		fsdis.close();
		fsdisNoFsChecksum.close();
	}

	/**
	 * 解析单层索引
	 */
	public void iteratorSingleIndex(BlockIndexReader dataBlockIndex) {
		for (int i = 0; i < dataBlockIndex.getRootBlockCount(); i++) {
			byte[] keyCell = dataBlockIndex.getRootBlockKey(i);
			int blockDataSize = dataBlockIndex.getRootBlockDataSize(i);
			String rowKey = parseKeyCellRowkey(keyCell);
			System.out.println("rowkey=" + rowKey + "\tdata size="
					+ blockDataSize);
		}
	}

	/**
	 * 解析多层索引，首先解析根索引
	 */
	public void iteratorRootIndex(BlockIndexReader dataBlockIndex)
			throws IOException {
		for (int i = 0; i < dataBlockIndex.getRootBlockCount(); i++) {
			long offset = dataBlockIndex.getRootBlockOffset(i);
			int onDiskSize = dataBlockIndex.getRootBlockDataSize(i);
			iteratorNonRootIndex(offset, onDiskSize);
		}
	}

	/**
	 * 递归解析每个中间索引
	 */
	public void iteratorNonRootIndex(long offset, int onDiskSize)
			throws IOException {
		HFileBlock block = fsBlockReader.readBlockData(offset, onDiskSize, -1,
				false);
		if (block.getBlockType().equals(BlockType.LEAF_INDEX)) {
			parseLeafIndex(block);
			return;
		}
		// 开始计算中间层索引的 每个key位置
		ByteBuffer buffer = block.getBufferReadOnly();

		buffer = ByteBuffer.wrap(buffer.array(),
				buffer.arrayOffset() + block.headerSize(),
				buffer.limit() - block.headerSize()).slice();
		int indexCount = buffer.getInt();

		// 二级索引全部偏移量，二级索引数据+二级索引总数(int)+索引文件总大小(int)
		int entriesOffset = Bytes.SIZEOF_INT * (indexCount + 2);
		for (int i = 0; i < indexCount; i++) {
			// 二级索引指向的偏移量
			// 如当前遍历到第一个key，那么二级索引偏移量就是 第二个int(第一个是索引总数)
			int indexKeyOffset = buffer.getInt(Bytes.SIZEOF_INT * (i + 1));
			long blockOffsetIndex = buffer.getLong(indexKeyOffset
					+ entriesOffset);
			int blockSizeIndex = buffer.getInt(indexKeyOffset + entriesOffset
					+ Bytes.SIZEOF_LONG);
			iteratorNonRootIndex(blockOffsetIndex, blockSizeIndex);
		}
	}

	/**
	 * 解析叶索引
	 */
	public void parseLeafIndex(HFileBlock block) {
		// 开始计算中间层索引的 每个key位置
		ByteBuffer buffer = block.getBufferReadOnly();
		buffer = ByteBuffer.wrap(buffer.array(),
				buffer.arrayOffset() + block.headerSize(),
				buffer.limit() - block.headerSize()).slice();
		int indexCount = buffer.getInt();

		// 二级索引全部偏移量，二级索引数据+二级索引总数(int)+索引文件总大小(int)
		int entriesOffset = Bytes.SIZEOF_INT * (indexCount + 2);
		for (int i = 0; i < indexCount; i++) {
			// 二级索引指向的偏移量
			// 如当前遍历到第一个key，那么二级索引偏移量就是 第二个int(第一个是索引总数)
			int indexKeyOffset = buffer.getInt(Bytes.SIZEOF_INT * (i + 1));

			// 全部二级索引长度+key偏移位置+ 块索引offset(long)+块大小(int)
			// 可以计算出真实的key的偏移位置
			int KeyOffset = entriesOffset + indexKeyOffset
					+ SECONDARY_INDEX_ENTRY_OVERHEAD;
			// long blockOffsetIndex =
			// buffer.getLong(indexKeyOffset+entriesOffset);
			int blockSizeIndex = buffer.getInt(indexKeyOffset + entriesOffset
					+ Bytes.SIZEOF_LONG);

			// 计算key的长度
			int length = buffer.getInt(Bytes.SIZEOF_INT * (i + 2))
					- indexKeyOffset - SECONDARY_INDEX_ENTRY_OVERHEAD;

			// 一个key
			// cell包含了key长度(2字节),key,family长度(1字节),family,qualifier,timestampe(8字节),keytype(1字节)
			// 这里只需要key就可以了
			byte[] keyCell = new byte[length];
			System.arraycopy(buffer.array(), buffer.arrayOffset() + KeyOffset,
					keyCell, 0, length);

			String rowKey = parseKeyCellRowkey(keyCell);
			System.out.println("rowkey=" + rowKey + "\t blockSizeIndex="
					+ blockSizeIndex);
		}
	}

	/**
	 * 通过keycell，解析出rowkey
	 */
	public static String parseKeyCellRowkey(byte[] cell) {
		if (cell == null || cell.length < 3) {
			throw new IllegalArgumentException("cell length is illegal");
		}
		int high = (cell[0] >> 8) & 0xFF;
		int low = cell[1] & 0xFF;
		int keySize = high + low;
		byte[] key = new byte[keySize];
		System.arraycopy(cell, 2, key, 0, key.length);
		return Bytes.toString(key);
	}

}

工具类：

/**
 * 自定义这样的类原因是HBase的实现是非public类
 */
public class MyFileInfo extends HbaseMapWritable<byte[], byte[]> {
	/**
	 * hfile保留的key，以"hfile."开头
	 */
	public static final String RESERVED_PREFIX = "hfile.";
	
	/**
	 * hfile前缀的二进制表示
	 */
	public static final byte[] RESERVED_PREFIX_BYTES = Bytes
			.toBytes(RESERVED_PREFIX);
	
	/**
	 * last key
	 */
	public static final byte[] LASTKEY = Bytes.toBytes(RESERVED_PREFIX + "LASTKEY");
	
	/**
	 * 平均key长度
	 */
	public static final byte[] AVG_KEY_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_KEY_LEN");
	
	/**
	 * 平均value长度
	 */
	public static final byte[] AVG_VALUE_LEN = Bytes.toBytes(RESERVED_PREFIX + "AVG_VALUE_LEN");
	
	/**
	 * 比较器
	 */
	public static final byte[] COMPARATOR = Bytes.toBytes(RESERVED_PREFIX + "COMPARATOR");

	/**
	 * 增加一个key/value 对到file info中，可选的可以检查key的前缀
	 */
	public MyFileInfo append(final byte[] k, final byte[] v, final boolean checkPrefix) throws IOException {
		if (k == null || v == null) {
			throw new NullPointerException("Key nor value may be null");
		}
		if (checkPrefix && isReservedFileInfoKey(k)) {
			throw new IOException("Keys with a " + SaeFileInfo.RESERVED_PREFIX
					+ " are reserved");
		}
		put(k, v);
		return this;
	}

	/**
	 * 检查当前的key是否以保留的前缀开头的
	 */
	public static boolean isReservedFileInfoKey(byte[] key) {
		return Bytes.startsWith(key, SaeFileInfo.RESERVED_PREFIX_BYTES);
	}

}



/**
 * 自定义序列化写入实现类
 *
 */
public class MyWritable implements Writable {

	@Override
	public void readFields(DataInput input) throws IOException {
		input.readInt();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.write(123456);
	}
}