HBase的批量put操作主要步骤
1.同个region的put视为同一批操作
2.对批量操作按rowkey进行字节排序
// Sort this region's batch of actions into their natural order
// (the surrounding notes describe this as byte order of the row key).
Collections.sort(actionsForRegion);
3.检查region server的全局内存是否超过阈值,如超过则唤醒flush线程进行flush操作
/**
 * Reacts to global memstore pressure using two water marks.
 * Above the high water mark the calling thread blocks in a loop, waking the flush
 * thread on each pass, until usage drops or the server is stopped. Above only the
 * low water mark the flush thread is woken and the caller returns without blocking.
 * Parts of the original method are elided in this excerpt.
 */
public void reclaimMemStoreMemory() {
// Above the high water mark: block the calling thread until memory drops.
// (Author's note: the mark defaults to 0.4 of the heap -- not visible here, verify against configuration.)
if (isAboveHighWaterMark()) {
lock.lock();
try {
boolean blocked = false;
long startTime = 0;
while (isAboveHighWaterMark() && !server.isStopped()) {
.....
// Wake the flush thread (the author's note describes this as submitting it a task).
wakeupFlushThread();
try {
// we should be able to wait forever, but we've seen a bug where
// we miss a notify, so put a 5 second bound on it at least.
flushOccurred.await(5, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
}
}
....
} finally {
lock.unlock();
}
}
// Above the low water mark only: wake the flush thread without blocking the caller.
// (Author's note: the mark defaults to 0.35 of the heap -- not visible here, verify against configuration.)
else if (isAboveLowWaterMark()) {
wakeupFlushThread();
}
}
4.检查这个region的memstore内存大小是否超过限制,超过则唤醒flush线程对该region进行flush,异步操作
/**
 * Blocks the caller while this region's memstore size exceeds blockingMemStoreSize,
 * requesting a flush on every pass through the loop.
 * Parts of the original method are elided in this excerpt.
 *
 * @throws RegionTooBusyException declared by the signature; the throwing code is not visible in this excerpt
 * @throws InterruptedIOException if the thread is interrupted while waiting; wraps the InterruptedException as its cause
 */
private void checkResources()
throws RegionTooBusyException, InterruptedIOException {
.....
boolean blocked = false;
long startTime = 0;
// Loop while the region's memstore size is above blockingMemStoreSize.
// (Author's note: blockingMemStoreSize defaults to 2x memstoreFlushSize, and
// memstoreFlushSize defaults to 128M -- not visible here, verify against configuration.)
while (this.memstoreSize.get() > this.blockingMemStoreSize) {
// Send a flush request for this region.
requestFlush();
。。。。。
blocked = true;
// Wait for at most min(timeToWait, threadWakeFrequency) (author's note: about 10s).
synchronized(this) {
try {
wait(Math.min(timeToWait, threadWakeFrequency));
} catch (InterruptedException ie) {
// Record how long updates were blocked before converting the interrupt to an I/O exception.
final long totalTime = EnvironmentEdgeManager.currentTimeMillis() - startTime;
if (totalTime > 0) {
this.updatesBlockedMs.add(totalTime);
}
LOG.info("Interrupted while waiting to unblock updates for region "
+ this + " '" + Thread.currentThread().getName() + "'");
InterruptedIOException iie = new InterruptedIOException();
iie.initCause(ie);
throw iie;
}
}
}
......
}
5.拿行锁,如果拿不到锁,则不处理
/**
 * Acquires the lock for {@code row} and returns an id that identifies the held lock.
 *
 * <p>A row lock is a {@link CountDownLatch} registered in {@code lockedRows}; whoever
 * manages to insert its latch owns the row. Other threads wait on the owner's latch,
 * which is counted down when the lock is released.
 *
 * @param row row key to lock; validated against this region by {@code checkRow}
 * @param waitForLock when {@code false}, return {@code null} immediately if the row is
 *        already locked instead of waiting
 * @return the id of the acquired lock, or {@code null} if the row was already locked
 *         and {@code waitForLock} is {@code false}
 * @throws IOException if waiting for the current owner exceeds {@code rowLockWaitDuration};
 *         an {@link InterruptedIOException} if the thread is interrupted while waiting
 */
private Integer internalObtainRowLock(final byte[] row, boolean waitForLock)
    throws IOException {
  // Verify that the row falls inside this region.
  checkRow(row, "row lock");
  startRegionOperation();
  try {
    HashedBytes rowKey = new HashedBytes(row);
    // Our candidate latch; it becomes the row lock if nobody else holds the row.
    CountDownLatch rowLatch = new CountDownLatch(1);

    // loop until we acquire the row lock (unless !waitForLock)
    while (true) {
      CountDownLatch existingLatch = lockedRows.putIfAbsent(rowKey, rowLatch);
      if (existingLatch == null) {
        // No previous latch for this row: we now own the lock.
        break;
      }
      // row already locked
      if (!waitForLock) {
        return null;
      }
      try {
        // Wait for the current owner to release, bounded by rowLockWaitDuration.
        if (!existingLatch.await(this.rowLockWaitDuration,
            TimeUnit.MILLISECONDS)) {
          throw new IOException("Timed out on getting lock for row="
              + Bytes.toStringBinary(row));
        }
      } catch (InterruptedException ie) {
        // Do not swallow the interrupt and silently retry: surface it to the caller
        // as an I/O interruption, the same way checkResources() does.
        InterruptedIOException iie = new InterruptedIOException(
            "Interrupted while waiting for lock on row=" + Bytes.toStringBinary(row));
        iie.initCause(ie);
        throw iie;
      }
    }

    // loop until we generate an unused lock id
    // (ids come from an atomically incremented counter)
    while (true) {
      Integer lockId = lockIdGenerator.incrementAndGet();
      HashedBytes existingRowKey = lockIds.putIfAbsent(lockId, rowKey);
      if (existingRowKey == null) {
        return lockId;
      } else {
        // lockId already in use, jump generator to a new spot
        lockIdGenerator.set(rand.nextInt());
      }
    }
  } finally {
    closeRegionOperation();
  }
}
6.修改KeyValue的timestamp为当前时间
7.拿mvcc的写事务id
/**
 * Starts a memstore write: allocates the next write number, wraps it in a
 * {@code WriteEntry}, appends that entry to {@code writeQueue} and returns it.
 *
 * @return the entry tracking the newly started write
 */
public WriteEntry beginMemstoreInsert() {
  synchronized (writeQueue) {
    // The write number is a long, incremented while holding the writeQueue lock.
    // The entry records the state of this write (whether it has completed).
    final WriteEntry entry = new WriteEntry(++memstoreWrite);
    writeQueue.add(entry);
    return entry;
  }
}
8.写入memstore的内存kv列表
/**
 * Adds {@code toAdd} to {@code kvset}, feeds it to the time range tracker and adds
 * the resulting heap-size change to {@code size}.
 *
 * @param toAdd the KeyValue to insert
 * @return the heap-size change computed by {@code heapSizeChange} for this insertion
 */
private long internalAdd(final KeyValue toAdd) {
  // How much the heap footprint changed because of this insertion.
  final long heapDelta = heapSizeChange(toAdd, this.kvset.add(toAdd));
  timeRangeTracker.includeTimestamp(toAdd);
  this.size.addAndGet(heapDelta);
  return heapDelta;
}
9.写HLog,但不flush,此时日志仍缓存在内存中
/**
 * Appends a set of edits for a region to the log and optionally syncs it.
 * Under updateLock it obtains a sequence number, records it in lastSeqWritten if the
 * region has no entry yet, writes the entry via doWrite and assigns a transaction id.
 * Parts of the original method are elided in this excerpt.
 *
 * @param info region the edits belong to
 * @param tableName name of the table, used to build the log key
 * @param edits the edits to write
 * @param clusterId cluster id, used to build the log key
 * @param now timestamp, used to build the log key
 * @param htd table descriptor; consulted for deferred log flush
 * @param doSync when true, the edit is synced before returning if the region is a
 *        meta region or the table does not use deferred log flush
 * @return the transaction id assigned to this append
 * @throws IOException declared by the signature; presumably propagated from doWrite or sync
 */
private long append(HRegionInfo info, byte [] tableName, WALEdit edits, UUID clusterId,
final long now, HTableDescriptor htd, boolean doSync)
throws IOException {
......
long txid = 0;
synchronized (this.updateLock) {
// Sequence number for this log entry.
long seqNum = obtainSeqNum();
// The 'lastSeqWritten' map holds the sequence number of the oldest
// write for each region (i.e. the first edit added to the particular
// memstore). . When the cache is flushed, the entry for the
// region being flushed is removed if the sequence number of the flush
// is greater than or equal to the value in lastSeqWritten.
// Use encoded name. Its shorter, guaranteed unique and a subset of
// actual name.
byte [] encodedRegionName = info.getEncodedNameAsBytes();
// Remember the sequence number of the region's first edit; putIfAbsent keeps an
// already-recorded value. (Author's note: on flush, all entries with a sequence
// number >= this value are written to file -- not visible here.)
this.lastSeqWritten.putIfAbsent(encodedRegionName, seqNum);
HLogKey logKey = makeKey(encodedRegionName, tableName, seqNum, now, clusterId);
doWrite(info, logKey, edits, htd);
this.numEntries.incrementAndGet();
// Transaction id, taken from the unflushedEntries counter (i.e. which log entry this is).
txid = this.unflushedEntries.incrementAndGet();
if (htd.isDeferredLogFlush()) {
lastDeferredTxid = txid;
}
}
// Sync if catalog region, and if not then check if that table supports
// deferred log flushing
if (doSync &&
(info.isMetaRegion() ||
!htd.isDeferredLogFlush())) {
// sync txn to file system
this.sync(txid);
}
return txid;
}
写log的cache
/**
 * Appends a new write to {@code pendingWrites}.
 *
 * <p>Entries are kept in our own queue rather than being written straight to the
 * HDFS output stream, because HDFSOutputStream.writeChunk is not lightweight at all.
 *
 * @param e the log entry to buffer
 * @throws IOException declared by the signature
 */
synchronized void append(Entry e) throws IOException {
  pendingWrites.add(e);
}
10.释放行锁
/**
 * Releases the row lock identified by {@code lockId}.
 *
 * <p>The lock id mapping is removed first, then the row's latch; counting the latch
 * down is what releases the lock. Unknown ids and rows without a latch are logged
 * and otherwise ignored.
 *
 * @param lockId id of the lock to release; {@code null} is a no-op
 */
public void releaseRowLock(final Integer lockId) {
  if (lockId == null) {
    // null lock id, do nothing
    return;
  }

  // Step 1: drop the lock id -> row key mapping.
  final HashedBytes lockedKey = lockIds.remove(lockId);
  if (lockedKey == null) {
    LOG.warn("Release unknown lockId: " + lockId);
    return;
  }

  // Step 2: drop the row's latch and, if there was one, release it.
  final CountDownLatch latch = lockedRows.remove(lockedKey);
  if (latch != null) {
    latch.countDown();
    return;
  }
  LOG.error("Releases row not locked, lockId: " + lockId + " row: "
      + lockedKey);
}
11.flush Hlog到HDFS
/**
 * Syncs all transactions up to the specified txid.
 * Returns early if the log is closed or if syncedTillHere already covers txid.
 * Otherwise it writes the pending entries to the writer under flushLock, syncs the
 * writer, and advances syncedTillHere. Each of the two I/O steps is retried once with
 * updateLock held if it throws an IOException.
 * Parts of the original method are elided in this excerpt.
 *
 * @param txid transaction id that must be synced when this method returns normally
 * @throws IOException if writing or syncing fails; a log roll is requested before rethrowing
 */
private void syncer(long txid) throws IOException {
Writer tempWriter;
synchronized (this.updateLock) {
if (this.closed) return;
tempWriter = this.writer; // guaranteed non-null
}
// if the transaction that we are interested in is already
// synced, then return immediately.
// Already synced up to this txid -- possibly by another (e.g. RPC) thread.
if (txid <= this.syncedTillHere) {
return;
}
try {
long doneUpto;
long now = System.currentTimeMillis();
// First flush all the pending writes to HDFS. Then
// issue the sync to HDFS. If sync is successful, then update
// syncedTillHere to indicate that transactions till this
// number has been successfully synced.
synchronized (flushLock) {
if (txid <= this.syncedTillHere) {
return;
}
doneUpto = this.unflushedEntries.get();
// All log entries currently buffered as pending writes.
List<Entry> pending = logSyncerThread.getPendingWrites();
try {
// Write the pending entries to the writer; they are not yet synced to HDFS.
logSyncerThread.hlogFlush(tempWriter, pending);
} catch(IOException io) {
synchronized (this.updateLock) {
// HBASE-4387, HBASE-5623, retry with updateLock held
tempWriter = this.writer;
logSyncerThread.hlogFlush(tempWriter, pending);
}
}
}
// another thread might have sync'ed avoid double-sync'ing
if (txid <= this.syncedTillHere) {
return;
}
try {
// Sync to HDFS; on failure retry once.
tempWriter.sync();
} catch(IOException io) {
synchronized (this.updateLock) {
// HBASE-4387, HBASE-5623, retry with updateLock held
tempWriter = this.writer;
tempWriter.sync();
}
}
// Record how far the log has been synced; Math.max keeps the marker from moving backwards.
this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto);
......
} catch (IOException e) {
LOG.fatal("Could not sync. Requesting close of hlog", e);
// Request a log roll (not a rollback) before rethrowing.
requestLogRoll();
throw e;
}
}
/**
 * Appends one log entry to the underlying writer after attaching the compression
 * context to it.
 *
 * @param entry the log entry whose key and edit are written
 * @throws IOException if the writer fails, or wrapping a NullPointerException raised
 *         inside the write (treated by the original comment as a concurrent close)
 */
@Override
public void append(HLog.Entry entry) throws IOException {
entry.setCompressionContext(compressionContext);
try {
// Write the entry's key and edit (author's note: this is a SequenceFile write).
this.writer.append(entry.getKey(), entry.getEdit());
} catch (NullPointerException npe) {
// Concurrent close...
throw new IOException(npe);
}
}
12.修改mvcc的读事务id
/**
 * Finishes a memstore write: marks the entry complete and advances the read point
 * via {@code advanceMemstore}, then blocks in {@code waitForRead} until the read
 * point has reached this entry's write number.
 *
 * @param e the entry returned when the write was started
 */
public void completeMemstoreInsert(WriteEntry e) {
  // Step 1: advance the read point as far as completed writes allow.
  advanceMemstore(e);
  // Step 2: wait until every earlier write has completed as well.
  waitForRead(e);
}
/**
 * Marks {@code e} as completed and advances the read point over the leading run of
 * completed entries in {@code writeQueue}.
 *
 * <p>Entries are removed from the head of the queue while they are completed. If an
 * earlier write is still in flight the scan stops there, so the resulting read
 * point may be smaller than this entry's write number.
 *
 * @param e the write entry that has just finished
 * @return {@code true} if the read point now covers {@code e}'s write number
 * @throws RuntimeException if the queue is empty, or if consecutive completed
 *         entries do not carry consecutive write numbers
 */
boolean advanceMemstore(WriteEntry e) {
  synchronized (writeQueue) {
    // This write is done.
    e.markCompleted();

    long lastCompleted = -1;
    boolean sawEntry = false;

    // Pop completed entries off the head of the queue, remembering the newest one.
    while (!writeQueue.isEmpty()) {
      sawEntry = true;
      WriteEntry head = writeQueue.getFirst();

      if (lastCompleted > 0 && lastCompleted + 1 != head.getWriteNumber()) {
        throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: "
            + lastCompleted + " next: " + head.getWriteNumber());
      }

      if (!head.isCompleted()) {
        // An earlier write is still pending; the read point cannot pass it.
        break;
      }
      lastCompleted = head.getWriteNumber();
      writeQueue.removeFirst();
    }

    if (!sawEntry) {
      throw new RuntimeException("never was a first");
    }

    // Publish the new read point: every write up to and including it has completed
    // and is visible to readers. Wake anyone waiting for the read point to move.
    if (lastCompleted > 0) {
      synchronized (readWaiters) {
        memstoreRead = lastCompleted;
        readWaiters.notifyAll();
      }
    }

    return memstoreRead >= e.getWriteNumber();
  }
}
/**
 * Blocks until the global read point has advanced up to the write number of the
 * given entry.
 *
 * <p>An interrupt does not cut the wait short: it is remembered, the loop keeps
 * waiting, and the thread's interrupt flag is set again on the way out.
 *
 * @param e the write entry whose write number the read point must reach
 */
public void waitForRead(WriteEntry e) {
  boolean wasInterrupted = false;
  synchronized (readWaiters) {
    // Earlier writes are still in flight while the read point lags behind ours.
    while (memstoreRead < e.getWriteNumber()) {
      try {
        readWaiters.wait();
      } catch (InterruptedException ie) {
        // Keep looping until the read point catches up; restore the flag afterwards.
        wasInterrupted = true;
      }
    }
  }
  if (wasInterrupted) {
    Thread.currentThread().interrupt();
  }
}
13.检查memstore的内存大小是否超过memstoreFlushSize,是则请求flush,异步
14.返回结果,如果put操作没拿到行锁,则结果是null