转载自:http://iwinit.iteye.com/blog/1824881
HBase的批量put操作主要步骤
1.同个region的put视为同一批操作
2.对批量操作按rowkey进行字节排序
- Collections.sort(actionsForRegion);
3.检查region server的全局内存是否超过阈值,如超过则唤醒flush线程进行flush操作
- public void reclaimMemStoreMemory() {
- //如果超过高水位,默认为堆内存的0.4,阻塞rpc线程直到内存减少到预期
- if (isAboveHighWaterMark()) {
- lock.lock();
- try {
- boolean blocked = false;
- long startTime = 0;
- while (isAboveHighWaterMark() && !server.isStopped()) {
- .....
- //给flush线程提交一个task
- wakeupFlushThread();
- try {
- // we should be able to wait forever, but we've seen a bug where
- // we miss a notify, so put a 5 second bound on it at least.
- flushOccurred.await(5, TimeUnit.SECONDS);
- } catch (InterruptedException ie) {
- Thread.currentThread().interrupt();
- }
- }
- ....
- } finally {
- lock.unlock();
- }
- }
- //如果超过低水位,默认为堆内存的0.35,给flush线程提交一个task,不阻塞线程
- else if (isAboveLowWaterMark()) {
- wakeupFlushThread();
- }
- }
4.检查这个region的memstore内存大小是否超过限制,超过则唤醒flush线程对该region进行flush,异步操作
- private void checkResources()
- throws RegionTooBusyException, InterruptedIOException {
- .....
- boolean blocked = false;
- long startTime = 0;
- //当前region内存大小超过blockingMemStoreSize,默认为memstoreFlushSize的2倍,memstoreFlushSize默认128M
- while (this.memstoreSize.get() > this.blockingMemStoreSize) {
- //给flush线程发个请求
- requestFlush();
- 。。。。。
- blocked = true;
- //等待一段时间,10s
- synchronized(this) {
- try {
- wait(Math.min(timeToWait, threadWakeFrequency));
- } catch (InterruptedException ie) {
- final long totalTime = EnvironmentEdgeManager.currentTimeMillis() - startTime;
- if (totalTime > 0) {
- this.updatesBlockedMs.add(totalTime);
- }
- LOG.info("Interrupted while waiting to unblock updates for region "
- + this + " '" + Thread.currentThread().getName() + "'");
- InterruptedIOException iie = new InterruptedIOException();
- iie.initCause(ie);
- throw iie;
- }
- }
- }
- ......
- }
5.拿行锁,如果拿不到锁,则不处理
- private Integer internalObtainRowLock(final byte[] row, boolean waitForLock)
- throws IOException {
- //检查row的范围是否在这个region里
- checkRow(row, "row lock");
- startRegionOperation();
- try {
- HashedBytes rowKey = new HashedBytes(row);
- //行锁是一个Latch,释放的时候Latch减1,等待线程就会被唤醒
- CountDownLatch rowLatch = new CountDownLatch(1);
- // loop until we acquire the row lock (unless !waitForLock)
- while (true) {
- //put一把
- CountDownLatch existingLatch = lockedRows.putIfAbsent(rowKey, rowLatch);
- //如果锁不存在,则认为拿到锁
- if (existingLatch == null) {
- break;
- }
- //已经有锁了,则等待锁释放或超时
- else {
- // row already locked
- if (!waitForLock) {
- return null;
- }
- try {
- if (!existingLatch.await(this.rowLockWaitDuration,
- TimeUnit.MILLISECONDS)) {
- throw new IOException("Timed out on getting lock for row="
- + Bytes.toStringBinary(row));
- }
- } catch (InterruptedException ie) {
- // Empty
- }
- }
- }
- // loop until we generate an unused lock id
- //锁id是一个原子递增的整数
- while (true) {
- Integer lockId = lockIdGenerator.incrementAndGet();
- HashedBytes existingRowKey = lockIds.putIfAbsent(lockId, rowKey);
- if (existingRowKey == null) {
- return lockId;
- } else {
- // lockId already in use, jump generator to a new spot
- lockIdGenerator.set(rand.nextInt());
- }
- }
- } finally {
- closeRegionOperation();
- }
- }
7.拿mvcc的写事务id
- public WriteEntry beginMemstoreInsert() {
- synchronized (writeQueue) {
- //事务id是一个原子递增的long
- long nextWriteNumber = ++memstoreWrite;
- //entry用来存这个事务的状态,是否已完成
- WriteEntry e = new WriteEntry(nextWriteNumber);
- writeQueue.add(e);
- return e;
- }
- }
- private long internalAdd(final KeyValue toAdd) {
- //堆内存加了多少
- long s = heapSizeChange(toAdd, this.kvset.add(toAdd));
- timeRangeTracker.includeTimestamp(toAdd);
- this.size.addAndGet(s);
- return s;
- }
- private long append(HRegionInfo info, byte [] tableName, WALEdit edits, UUID clusterId,
- final long now, HTableDescriptor htd, boolean doSync)
- throws IOException {
- ......
- long txid = 0;
- synchronized (this.updateLock) {
- //log的序列号
- long seqNum = obtainSeqNum();
- // The 'lastSeqWritten' map holds the sequence number of the oldest
- // write for each region (i.e. the first edit added to the particular
- // memstore). . When the cache is flushed, the entry for the
- // region being flushed is removed if the sequence number of the flush
- // is greater than or equal to the value in lastSeqWritten.
- // Use encoded name. Its shorter, guaranteed unique and a subset of
- // actual name.
- byte [] encodedRegionName = info.getEncodedNameAsBytes();
- //region第一个修改的事务id,flush时所有大于等于该值的entry都会被写入文件
- this.lastSeqWritten.putIfAbsent(encodedRegionName, seqNum);
- HLogKey logKey = makeKey(encodedRegionName, tableName, seqNum, now, clusterId);
- doWrite(info, logKey, edits, htd);
- this.numEntries.incrementAndGet();
- //事务id,代表第几条log
- txid = this.unflushedEntries.incrementAndGet();
- if (htd.isDeferredLogFlush()) {
- lastDeferredTxid = txid;
- }
- }
- // Sync if catalog region, and if not then check if that table supports
- // deferred log flushing
- if (doSync &&
- (info.isMetaRegion() ||
- !htd.isDeferredLogFlush())) {
- // sync txn to file system
- this.sync(txid);
- }
- return txid;
- }
- 写log的cache
- // appends new writes to the pendingWrites. It is better to keep it in
- // our own queue rather than writing it to the HDFS output stream because
- // HDFSOutputStream.writeChunk is not lightweight at all.
- synchronized void append(Entry e) throws IOException {
- pendingWrites.add(e);
- }
- public void releaseRowLock(final Integer lockId) {
- if (lockId == null) return; // null lock id, do nothing
- //先删除lock id
- HashedBytes rowKey = lockIds.remove(lockId);
- if (rowKey == null) {
- LOG.warn("Release unknown lockId: " + lockId);
- return;
- }
- //再删除lock
- CountDownLatch rowLatch = lockedRows.remove(rowKey);
- if (rowLatch == null) {
- LOG.error("Releases row not locked, lockId: " + lockId + " row: "
- + rowKey);
- return;
- }
- //lock释放
- rowLatch.countDown();
- }
- // sync all transactions upto the specified txid
- private void syncer(long txid) throws IOException {
- Writer tempWriter;
- synchronized (this.updateLock) {
- if (this.closed) return;
- tempWriter = this.writer; // guaranteed non-null
- }
- // if the transaction that we are interested in is already
- // synced, then return immediately.
- //如果该txid之前的日志已经sync完成(可能已被其他rpc线程顺带sync掉),直接返回
- if (txid <= this.syncedTillHere) {
- return;
- }
- try {
- long doneUpto;
- long now = System.currentTimeMillis();
- // First flush all the pending writes to HDFS. Then
- // issue the sync to HDFS. If sync is successful, then update
- // syncedTillHere to indicate that transactions till this
- // number has been successfully synced.
- synchronized (flushLock) {
- if (txid <= this.syncedTillHere) {
- return;
- }
- doneUpto = this.unflushedEntries.get();
- //当前所有cache的log
- List<Entry> pending = logSyncerThread.getPendingWrites();
- try {
- //写,但没sync到HDFS
- logSyncerThread.hlogFlush(tempWriter, pending);
- } catch(IOException io) {
- synchronized (this.updateLock) {
- // HBASE-4387, HBASE-5623, retry with updateLock held
- tempWriter = this.writer;
- logSyncerThread.hlogFlush(tempWriter, pending);
- }
- }
- }
- // another thread might have sync'ed avoid double-sync'ing
- if (txid <= this.syncedTillHere) {
- return;
- }
- try {
- //sync到HDFS,写失败重试一次
- tempWriter.sync();
- } catch(IOException io) {
- synchronized (this.updateLock) {
- // HBASE-4387, HBASE-5623, retry with updateLock held
- tempWriter = this.writer;
- tempWriter.sync();
- }
- }
- //当前已sync的日志
- this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto);
- ......
- } catch (IOException e) {
- LOG.fatal("Could not sync. Requesting close of hlog", e);
- //请求滚动日志(log roll),切换到新的日志文件,并非回滚。
- requestLogRoll();
- throw e;
- }
- }
- @Override
- public void append(HLog.Entry entry) throws IOException {
- entry.setCompressionContext(compressionContext);
- try {
- //SequenceFile写入
- this.writer.append(entry.getKey(), entry.getEdit());
- } catch (NullPointerException npe) {
- // Concurrent close...
- throw new IOException(npe);
- }
- }
12.修改mvcc的读事务id
- public void completeMemstoreInsert(WriteEntry e) {
- //递增读事务id
- advanceMemstore(e);
- //等待之前的请求全部完成
- waitForRead(e);
- }
- boolean advanceMemstore(WriteEntry e) {
- synchronized (writeQueue) {
- //事务结束
- e.markCompleted();
- long nextReadValue = -1;
- boolean ranOnce=false;
- //遍历队列,拿到最近已完成的事务id,如果中间有一个请求还未完成,则可能拿到的事务id比当前事务小
- while (!writeQueue.isEmpty()) {
- ranOnce=true;
- WriteEntry queueFirst = writeQueue.getFirst();
- if (nextReadValue > 0) {
- if (nextReadValue+1 != queueFirst.getWriteNumber()) {
- throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: "
- + nextReadValue + " next: " + queueFirst.getWriteNumber());
- }
- }
- if (queueFirst.isCompleted()) {
- nextReadValue = queueFirst.getWriteNumber();
- writeQueue.removeFirst();
- } else {
- break;
- }
- }
- if (!ranOnce) {
- throw new RuntimeException("never was a first");
- }
- //修改读事务的id,所有小于该id的事务都已完成,对read可见
- if (nextReadValue > 0) {
- synchronized (readWaiters) {
- memstoreRead = nextReadValue;
- readWaiters.notifyAll();
- }
- }
- if (memstoreRead >= e.getWriteNumber()) {
- return true;
- }
- return false;
- }
- }
- /**
- * Wait for the global readPoint to advance upto
- * the specified transaction number.
- */
- public void waitForRead(WriteEntry e) {
- boolean interrupted = false;
- synchronized (readWaiters) {
- //如果前面请求还未处理完,则等待它们结束
- while (memstoreRead < e.getWriteNumber()) {
- try {
- readWaiters.wait(0);
- } catch (InterruptedException ie) {
- // We were interrupted... finish the loop -- i.e. cleanup --and then
- // on our way out, reset the interrupt flag.
- interrupted = true;
- }
- }
- }
- if (interrupted) Thread.currentThread().interrupt();
- }
13.检查memstore的内存大小是否超过memstoreFlushSize,是则请求flush,异步
14.返回结果,如果put操作没拿到行锁,则结果是null
http://blog.cloudera.com/blog/2012/06/hbase-write-path/