地址:http://aperise.iteye.com/blog/2372505
源码解读--(1)hbase客户端源代码 | http://aperise.iteye.com/blog/2372350 |
源码解读--(2)hbase-examples BufferedMutator Example | http://aperise.iteye.com/blog/2372505 |
源码解读--(3)hbase-examples MultiThreadedClientExample | http://aperise.iteye.com/blog/2372534 |
1.摒弃HTable,直接创建HTable里的BufferedMutator对象操作hbase客户端完全可行
在前面的hbase客户端源代码分析中,我们客户端的创建方式如下:
- //默认connection实现是org.apache.hadoop.hbase.client.ConnectionManager.HConnectionImplementation
- Connection connection = ConnectionFactory.createConnection(configuration);
- //默认table实现是org.apache.hadoop.hbase.client.HTable
- Table table = connection.getTable(TableName.valueOf("tableName"));
- 默认我们拿到了connection的实现org.apache.hadoop.hbase.client.ConnectionManager.HConnectionImplementation,里面我们需要注意的是通过setupRegistry()方法设置了与zookeeper交互的重要类org.apache.hadoop.hbase.client.ZookeeperRegistry类,后续与zookeeper交互都由此类完成
- 然后通过connection拿到了table的实现org.apache.hadoop.hbase.client.HTable
- 最后发现org.apache.hadoop.hbase.client.HTable归根结底持有的就是BufferedMutatorImpl类型的属性mutator,所有后续的操作都是基于mutator操作
那么其实我们操作hbase客户端,完全可以摒弃HTable对象,直接构建BufferedMutator,然后操作hbase,正如所料,在hbase的源码模块hbase-examples里也正好提到了这种使用方法,使用的关键代码如下:
- Configuration configuration = HBaseConfiguration.create();
- configuration.set("hbase.zookeeper.property.clientPort", "2181");
- configuration.set("hbase.client.write.buffer", "2097152");
- configuration.set("hbase.zookeeper.quorum","192.168.199.31,192.168.199.32,192.168.199.33,192.168.199.34,192.168.199.35");
- BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf("tableName"));
- //3177 is not arbitrary: it is derived from 2*hbase.client.write.buffer/put.heapSize()
- int bestBatchPutSize = 3177;
- //JDK 1.7 try-with-resources: every resource declared in try(...) must implement java.io.Closeable
- //and is closed automatically on exit, i.e. conn.close() and mutator.close() are invoked (implicit finally)
- try(
- //default Connection implementation is org.apache.hadoop.hbase.client.ConnectionManager.HConnectionImplementation
- Connection conn = ConnectionFactory.createConnection(configuration);
- //default mutator implementation is org.apache.hadoop.hbase.client.BufferedMutatorImpl
- BufferedMutator mutator = conn.getBufferedMutator(params);
- ){
- List<Put> putLists = new ArrayList<Put>();
- for(int count=0;count<100000;count++){
- //FIX: rowkey was undefined in the original listing; derive a unique rowkey per record
- String rowkey = "rowkey" + count;
- Put put = new Put(rowkey.getBytes());
- put.addImmutable("columnFamily1".getBytes(), "columnName1".getBytes(), "columnValue1".getBytes());
- put.addImmutable("columnFamily1".getBytes(), "columnName2".getBytes(), "columnValue2".getBytes());
- put.addImmutable("columnFamily1".getBytes(), "columnName3".getBytes(), "columnValue3".getBytes());
- put.setDurability(Durability.SKIP_WAL);
- putLists.add(put);
- if(putLists.size()==bestBatchPutSize){
- //optimal batch size reached: submit immediately
- mutator.mutate(putLists);
- mutator.flush();
- putLists.clear();
- }
- }
- //submit whatever is left over in one final batch
- mutator.mutate(putLists);
- mutator.flush();
- }catch(IOException e) {
- LOG.info("exception while creating/destroying Connection or BufferedMutator", e);
- }
2.BufferedMutatorParams
BufferedMutatorParams主要是收集构造BufferedMutator对象的参数信息,这些参数包括hbase数据表名、hbase客户端缓冲区、hbase rowkey最大所占空间、线程池和监听hbase操作的回调监听器(比如监听hbase写入失败)
- package org.apache.hadoop.hbase.client;
- import java.util.concurrent.ExecutorService;
- import org.apache.hadoop.hbase.TableName;
- import org.apache.hadoop.hbase.classification.InterfaceAudience;
- import org.apache.hadoop.hbase.classification.InterfaceStability;
- /**
- * Parameter holder used to build a BufferedMutator: table name, client write
- * buffer size, maximum KeyValue size, thread pool, and the exception listener
- * that is notified of failed hbase operations (e.g. write failures).
- */
- @InterfaceAudience.Public
- @InterfaceStability.Evolving
- public class BufferedMutatorParams {
- static final int UNSET = -1;
- private final TableName tableName;//target hbase table
- private long writeBufferSize = UNSET;//client-side write buffer size
- private int maxKeyValueSize = UNSET;//maximum space a single KeyValue may occupy
- private ExecutorService pool = null;//thread pool
- private BufferedMutator.ExceptionListener listener = new BufferedMutator.ExceptionListener() {//callback listener for hbase operations (e.g. write failures); the default simply rethrows
- @Override
- public void onException(RetriesExhaustedWithDetailsException exception,
- BufferedMutator bufferedMutator)
- throws RetriesExhaustedWithDetailsException {
- throw exception;
- }
- };
- public BufferedMutatorParams(TableName tableName) {//constructor
- this.tableName = tableName;
- }
- public TableName getTableName() {//returns the table name
- return tableName;
- }
- public long getWriteBufferSize() {//returns the write buffer size
- return writeBufferSize;
- }
- /**
- * Fluent setter for the write buffer size.
- */
- public BufferedMutatorParams writeBufferSize(long writeBufferSize) {
- this.writeBufferSize = writeBufferSize;
- return this;
- }
- public int getMaxKeyValueSize() {//returns the maximum KeyValue size
- return maxKeyValueSize;
- }
- /**
- * Fluent setter for the maximum KeyValue size.
- */
- public BufferedMutatorParams maxKeyValueSize(int maxKeyValueSize) {
- this.maxKeyValueSize = maxKeyValueSize;
- return this;
- }
- public ExecutorService getPool() {//returns the thread pool
- return pool;
- }
- public BufferedMutatorParams pool(ExecutorService pool) {//fluent setter for the thread pool (not a constructor as the original comment claimed)
- this.pool = pool;
- return this;
- }
- public BufferedMutator.ExceptionListener getListener() {//returns the listener
- return listener;
- }
- public BufferedMutatorParams listener(BufferedMutator.ExceptionListener listener) {//fluent setter for the exception listener (not a constructor as the original comment claimed)
- this.listener = listener;
- return this;
- }
- }
3.BufferedMutator
BufferedMutator是一个接口,主要定义了一些抽象方法:
- public interface BufferedMutator extends Closeable {
- TableName getName();//returns the target table name
- Configuration getConfiguration();//returns the hadoop Configuration object
- void mutate(Mutation mutation) throws IOException;//adds one mutation to the client-side buffer
- void mutate(List<? extends Mutation> mutations) throws IOException;//adds a batch of mutations to the client-side buffer
- @Override
- void close() throws IOException;//from Closeable, so JDK1.7 try-with-resources can close the object without an explicit finally
- void flush() throws IOException;//submits the buffered data to the hbase server
- long getWriteBufferSize();//returns the write buffer size
- @InterfaceAudience.Public
- @InterfaceStability.Evolving
- interface ExceptionListener {//callback listener notified of failed operations
- public void onException(RetriesExhaustedWithDetailsException exception,
- BufferedMutator mutator) throws RetriesExhaustedWithDetailsException;
- }
- }
4.BufferedMutatorImpl
- package org.apache.hadoop.hbase.client;
- import com.google.common.annotations.VisibleForTesting;
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.conf.Configuration;
- import org.apache.hadoop.hbase.TableName;
- import org.apache.hadoop.hbase.classification.InterfaceAudience;
- import org.apache.hadoop.hbase.classification.InterfaceStability;
- import org.apache.hadoop.hbase.ipc.RpcControllerFactory;
- import java.io.IOException;
- import java.io.InterruptedIOException;
- import java.util.Arrays;
- import java.util.LinkedList;
- import java.util.List;
- import java.util.concurrent.ConcurrentLinkedQueue;
- import java.util.concurrent.ExecutorService;
- import java.util.concurrent.TimeUnit;
- import java.util.concurrent.atomic.AtomicLong;
- /**
- * BufferedMutatorImpl was introduced in hbase 1.0.0.
- * It is mainly intended for operating on one table from multiple threads.
- * Note that when a single BufferedMutator is shared across threads, an error
- * raised in one thread is also observed by the other threads.
- */
- @InterfaceAudience.Private
- @InterfaceStability.Evolving
- public class BufferedMutatorImpl implements BufferedMutator {
- private static final Log LOG = LogFactory.getLog(BufferedMutatorImpl.class);
- private final ExceptionListener listener;//callback invoked when client operations fail
- protected ClusterConnection connection; //the connection this mutator holds
- private final TableName tableName;//target hbase table
- private volatile Configuration conf;//hadoop Configuration object
- @VisibleForTesting
- final ConcurrentLinkedQueue<Mutation> writeAsyncBuffer = new ConcurrentLinkedQueue<Mutation>();//client-side buffer queue
- @VisibleForTesting
- AtomicLong currentWriteBufferSize = new AtomicLong(0);//thread-safe running total of the heap size currently held in the buffer
- private long writeBufferSize;//client write buffer capacity
- private final int maxKeyValueSize;//maximum space a single KeyValue may occupy
- private boolean closed = false;//whether this mutator has been closed
- private final ExecutorService pool;//thread pool used by the client
- @VisibleForTesting
- protected AsyncProcess ap; //asynchronous operation engine
- BufferedMutatorImpl(ClusterConnection conn, RpcRetryingCallerFactory rpcCallerFactory,
- RpcControllerFactory rpcFactory, BufferedMutatorParams params) {
- if (conn == null || conn.isClosed()) {
- throw new IllegalArgumentException("Connection is null or closed.");
- }
- this.tableName = params.getTableName();
- this.connection = conn;
- this.conf = connection.getConfiguration();
- this.pool = params.getPool();
- this.listener = params.getListener();
- //build a ConnectionConfiguration from the supplied conf; settings the client did not set fall back to defaults
- ConnectionConfiguration tableConf = new ConnectionConfiguration(conf);
- //resolve the write buffer size (explicit param wins over configuration)
- this.writeBufferSize = params.getWriteBufferSize() != BufferedMutatorParams.UNSET ? params.getWriteBufferSize() : tableConf.getWriteBufferSize();
- //resolve the maximum KeyValue size (explicit param wins over configuration)
- this.maxKeyValueSize = params.getMaxKeyValueSize() != BufferedMutatorParams.UNSET ? params.getMaxKeyValueSize() : tableConf.getMaxKeyValueSize();
- //asynchronous operation engine
- ap = new AsyncProcess(connection, conf, pool, rpcCallerFactory, true, rpcFactory);
- }
- @Override
- public TableName getName() {//returns the table name
- return tableName;
- }
- @Override
- public Configuration getConfiguration() {//returns the hadoop Configuration the client passed in
- return conf;
- }
- @Override
- public void mutate(Mutation m) throws InterruptedIOException,
- RetriesExhaustedWithDetailsException {//buffers a single mutation
- mutate(Arrays.asList(m));
- }
- @Override
- public void mutate(List<? extends Mutation> ms) throws InterruptedIOException, RetriesExhaustedWithDetailsException {
- //refuse further work once this BufferedMutatorImpl has been closed
- if (closed) {
- throw new IllegalStateException("Cannot put when the BufferedMutator is closed.");
- }
- //accumulate the heap size of all submitted mutations into toAddSize
- long toAddSize = 0;
- for (Mutation m : ms) {
- if (m instanceof Put) {
- validatePut((Put) m);
- }
- toAddSize += m.heapSize();
- }
- // This behavior is highly non-intuitive... it does not protect us against
- // 94-incompatible behavior, which is a timing issue because hasError, the below code
- // and setter of hasError are not synchronized. Perhaps it should be removed.
- if (ap.hasError()) {
- //grow the tracked buffer size by toAddSize
- currentWriteBufferSize.addAndGet(toAddSize);
- //queue the submitted mutations into writeAsyncBuffer; they stay queued until the submit completes
- writeAsyncBuffer.addAll(ms);
- //an earlier error was detected: flush synchronously so the error surfaces
- backgroundFlushCommits(true);
- } else {
- //grow the tracked buffer size by toAddSize
- currentWriteBufferSize.addAndGet(toAddSize);
- //queue the submitted mutations into writeAsyncBuffer; they stay queued until the submit completes
- writeAsyncBuffer.addAll(ms);
- }
- // Now try and queue what needs to be queued.
- // If the buffered heap size exceeds hbase.client.write.buffer (default 2MB), flush in the background;
- // otherwise return immediately without doing anything
- while (currentWriteBufferSize.get() > writeBufferSize) {
- backgroundFlushCommits(false);
- }
- }
- // validate a Put against the configured maximum KeyValue size
- public void validatePut(final Put put) throws IllegalArgumentException {
- HTable.validatePut(put, maxKeyValueSize);
- }
- @Override
- public synchronized void close() throws IOException {
- try {
- if (this.closed) {//already closed: nothing to do
- return;
- }
- //perform one final synchronous flush before closing
- backgroundFlushCommits(true);
- this.pool.shutdown();//shut down the thread pool
- boolean terminated;
- int loopCnt = 0;
- do {
- // wait until the pool has terminated
- terminated = this.pool.awaitTermination(60, TimeUnit.SECONDS);
- loopCnt += 1;
- if (loopCnt >= 10) {
- LOG.warn("close() failed to terminate pool after 10 minutes. Abandoning pool.");
- break;
- }
- } while (!terminated);
- } catch (InterruptedException e) {
- LOG.warn("waitForTermination interrupted");
- } finally {
- this.closed = true;
- }
- }
- @Override
- public synchronized void flush() throws InterruptedIOException, RetriesExhaustedWithDetailsException {
- //explicitly submit the buffered data to the hbase server
- backgroundFlushCommits(true);
- }
- private void backgroundFlushCommits(boolean synchronous) throws InterruptedIOException, RetriesExhaustedWithDetailsException {
- LinkedList<Mutation> buffer = new LinkedList<>();
- // Keep track of the size so that this thread doesn't spin forever
- long dequeuedSize = 0;
- try {
- //drain the queued mutations (Put is an implementation of Mutation)
- Mutation m;
- //while (writeBufferSize <= 0 || dequeuedSize < writeBufferSize*2 || synchronous) and writeAsyncBuffer still holds entries:
- //accumulate each entry's heap size into dequeuedSize
- //and decrement currentWriteBufferSize accordingly
- while ((writeBufferSize <= 0 || dequeuedSize < (writeBufferSize * 2) || synchronous) && (m = writeAsyncBuffer.poll()) != null) {
- buffer.add(m);
- long size = m.heapSize();
- dequeuedSize += size;
- currentWriteBufferSize.addAndGet(-size);
- }
- //asynchronous call with nothing dequeued: nothing to do
- if (!synchronous && dequeuedSize == 0) {
- return;
- }
- //backgroundFlushCommits(false) enters here and does not wait for the results
- if (!synchronous) {
- //submit without waiting for the results
- ap.submit(tableName, buffer, true, null, false);
- if (ap.hasError()) {
- LOG.debug(tableName + ": One or more of the operations have failed -"
- + " waiting for all operation in progress to finish (successfully or not)");
- }
- }
- //backgroundFlushCommits(true) enters here and waits for the results
- if (synchronous || ap.hasError()) {
- while (!buffer.isEmpty()) {
- ap.submit(tableName, buffer, true, null, false);
- }
- //wait for the results
- RetriesExhaustedWithDetailsException error = ap.waitForAllPreviousOpsAndReset(null);
- if (error != null) {
- if (listener == null) {
- throw error;
- } else {
- this.listener.onException(error, this);
- }
- }
- }
- } finally {
- //requeue anything left over so a later call can commit it
- for (Mutation mut : buffer) {
- long size = mut.heapSize();
- currentWriteBufferSize.addAndGet(size);
- dequeuedSize -= size;
- writeAsyncBuffer.add(mut);
- }
- }
- }
- /**
- * Sets the client write buffer size; flushes if the buffer already exceeds it.
- */
- @Deprecated
- public void setWriteBufferSize(long writeBufferSize) throws RetriesExhaustedWithDetailsException,
- InterruptedIOException {
- this.writeBufferSize = writeBufferSize;
- if (currentWriteBufferSize.get() > writeBufferSize) {
- flush();
- }
- }
- /**
- * Returns the write buffer size.
- */
- @Override
- public long getWriteBufferSize() {
- return this.writeBufferSize;
- }
- @Deprecated
- public List<Row> getWriteBuffer() {
- return Arrays.asList(writeAsyncBuffer.toArray(new Row[0]));
- }
- }
5.BufferedMutatorExample
在hbase的源代码模块hbase-examples里提供了使用hbase客户端的例子,这个java类是BufferedMutatorExample,这个类向我们展示了另外一种操作hbase客户端的实现,其代码如下:
- import org.apache.commons.logging.Log;
- import org.apache.commons.logging.LogFactory;
- import org.apache.hadoop.conf.Configured;
- import org.apache.hadoop.hbase.TableName;
- import org.apache.hadoop.hbase.client.BufferedMutator;
- import org.apache.hadoop.hbase.client.BufferedMutatorParams;
- import org.apache.hadoop.hbase.client.Connection;
- import org.apache.hadoop.hbase.client.ConnectionFactory;
- import org.apache.hadoop.hbase.client.Put;
- import org.apache.hadoop.hbase.client.RetriesExhaustedWithDetailsException;
- import org.apache.hadoop.hbase.util.Bytes;
- import org.apache.hadoop.util.Tool;
- import org.apache.hadoop.util.ToolRunner;
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.concurrent.Callable;
- import java.util.concurrent.ExecutionException;
- import java.util.concurrent.ExecutorService;
- import java.util.concurrent.Executors;
- import java.util.concurrent.Future;
- import java.util.concurrent.TimeUnit;
- import java.util.concurrent.TimeoutException;
- /**
- * An example of using the {@link BufferedMutator} interface.
- */
- public class BufferedMutatorExample extends Configured implements Tool {
- private static final Log LOG = LogFactory.getLog(BufferedMutatorExample.class);
- private static final int POOL_SIZE = 10;// worker thread pool size
- private static final int TASK_COUNT = 100;// number of tasks to submit
- private static final TableName TABLE = TableName.valueOf("foo");// hbase table foo
- private static final byte[] FAMILY = Bytes.toBytes("f");// column family f of table foo
- /**
- * Overrides Tool.run(String [] args); receives the String[] args of the main method.
- */
- @Override
- public int run(String[] args) throws InterruptedException, ExecutionException, TimeoutException {
- /** An asynchronous callback listener, triggered when an hbase write fails. */
- final BufferedMutator.ExceptionListener listener = new BufferedMutator.ExceptionListener() {
- @Override
- public void onException(RetriesExhaustedWithDetailsException e, BufferedMutator mutator) {
- for (int i = 0; i < e.getNumExceptions(); i++) {
- LOG.info("Failed to sent put " + e.getRow(i) + ".");
- }
- }
- };
- /**
- * BufferedMutatorParams collects the construction parameters of BufferedMutator.
- * Its parameters are:
- * TableName tableName
- * long writeBufferSize
- * int maxKeyValueSize
- * ExecutorService pool
- * BufferedMutator.ExceptionListener listener
- * Only tableName and listener are set here.
- * */
- BufferedMutatorParams params = new BufferedMutatorParams(TABLE).listener(listener);
- /**
- * step 1: create a single Connection and BufferedMutator, shared by all threads in the pool.
- * This uses the JDK 1.7 try-with-resources feature: every resource declared in try(...)
- * must implement java.io.Closeable and has its close() method invoked on exit,
- * i.e. it provides the same guarantee as writing
- * finally{
- * conn.close();
- * mutator.close();
- * }
- */
- try (
- final Connection conn = ConnectionFactory.createConnection(getConf());
- final BufferedMutator mutator = conn.getBufferedMutator(params)
- ) {
- /** worker pool of size 10 that operates on the shared BufferedMutator */
- final ExecutorService workerPool = Executors.newFixedThreadPool(POOL_SIZE);
- List<Future<Void>> futures = new ArrayList<>(TASK_COUNT);
- /** submit 100 tasks to the worker pool */
- for (int i = 0; i < TASK_COUNT; i++) {
- futures.add(workerPool.submit(new Callable<Void>() {
- @Override
- public Void call() throws Exception {
- /**
- * step 2: every task writes into the shared BufferedMutator buffer
- * (hbase.client.write.buffer) and shares the callback listener and thread pool
- * */
- /**
- * build the Put object
- * */
- Put p = new Put(Bytes.toBytes("someRow"));
- p.addColumn(FAMILY, Bytes.toBytes("someQualifier"), Bytes.toBytes("some value"));
- /**
- * add the data to the BufferedMutator buffer (hbase.client.write.buffer);
- * nothing is sent to the server until the buffered size exceeds hbase.client.write.buffer
- * */
- mutator.mutate(p);
- /**
- * TODO
- * you may call mutator.flush() here to push the data to the server before the task exits
- * mutator.flush();
- * */
- return null;
- }
- }));
- }
- /**
- * step 3: wait on each task's Future, allowing up to 5 minutes per task
- */
- for (Future<Void> f : futures) {
- f.get(5, TimeUnit.MINUTES);
- }
- /**
- * finally shut down the worker pool
- */
- workerPool.shutdown();
- } catch (IOException e) {
- // exception while creating/destroying Connection or BufferedMutator
- LOG.info("exception while creating/destroying Connection or BufferedMutator", e);
- }
- /**
- * no finally block is needed: try-with-resources already invokes
- * conn.close() and mutator.close() automatically
- */
- return 0;
- }
- public static void main(String[] args) throws Exception {
- //ToolRunner invokes run() of the Tool implementation BufferedMutatorExample, passing String[] args through
- ToolRunner.run(new BufferedMutatorExample(), args);
- }
- }
6.源码收获
- BufferedMutator完全可以用于操作hbase客户端;
- BufferedMutator可以供多线程共享使用;