转载自:http://iwinit.iteye.com/blog/1824881
HBase的批量put操作主要步骤
1.同个region的put视为同一批操作
2.对批量操作按rowkey进行字节排序
- Collections.sort(actionsForRegion);
3.检查region server的全局内存是否超过阈值,如超过则唤醒flush线程进行flush操作
- public void reclaimMemStoreMemory() {
- //如果超过高水位,默认为堆内存的0.4,阻塞rpc线程直到内存减少到预期
- if (isAboveHighWaterMark()) {
- lock.lock();
- try {
- boolean blocked = false;
- long startTime = 0;
- while (isAboveHighWaterMark() && !server.isStopped()) {
- .....
- //给flush线程提交一个task
- wakeupFlushThread();
- try {
- // we should be able to wait forever, but we've seen a bug where
- // we miss a notify, so put a 5 second bound on it at least.
- flushOccurred.await(5, TimeUnit.SECONDS);
- } catch (InterruptedException ie) {
- Thread.currentThread().interrupt();
- }
- }
- ....
- } finally {
- lock.unlock();
- }
- }
- //如果超过低水位,默认为堆内存的0.35,给flush线程提交一个task,不阻塞线程
- else if (isAboveLowWaterMark()) {
- wakeupFlushThread();
- }
- }
4.检查这个region的memstore内存大小是否超过限制,超过则唤醒flush线程对该region进行flush,异步操作
- private void checkResources()
- throws RegionTooBusyException, InterruptedIOException {
- .....
- boolean blocked = false;
- long startTime = 0;
- //当前region内存大小超过blockingMemStoreSize,默认为memstoreFlushSize的2倍,memstoreFlushSize默认128M
- while (this.memstoreSize.get() > this.blockingMemStoreSize) {
- //给flush线程发个请求
- requestFlush();
- 。。。。。
- blocked = true;
- //等待一段时间,10s
- synchronized(this) {
- try {
- wait(Math.min(timeToWait, threadWakeFrequency));
- } catch (InterruptedException ie) {
- final long totalTime = EnvironmentEdgeManager.currentTimeMillis() - startTime;
- if (totalTime > 0) {
- this.updatesBlockedMs.add(totalTime);
- }
- LOG.info("Interrupted while waiting to unblock updates for region "
- + this + " '" + Thread.currentThread().getName() + "'");
- InterruptedIOException iie = new InterruptedIOException();
- iie.initCause(ie);
- throw iie;
- }
- }
- }
- ......
- }
5.拿行锁,如果拿不到锁,则不处理
- private Integer internalObtainRowLock(final byte[] row, boolean waitForLock)
- throws IOException {
- //检查row的范围是否在这个region里
- checkRow(row, "row lock");
- startRegionOperation();
- try {
- HashedBytes rowKey = new HashedBytes(row);
- //行锁是一个Latch,释放的时候Latch减1,等待线程就会被唤醒
- CountDownLatch rowLatch = new CountDownLatch(1);
- // loop until we acquire the row lock (unless !waitForLock)
- while (true) {
- //put一把
- CountDownLatch existingLatch = lockedRows.putIfAbsent(rowKey, rowLatch);
- //如果锁不存在,则认为拿到锁
- if (existingLatch == null) {
- break;
- }
- //已经有锁了,则等待锁释放或超时
- else {
- // row already locked
- if (!waitForLock) {
- return null;
- }
- try {
- if (!existingLatch.await(this.rowLockWaitDuration,
- TimeUnit.MILLISECONDS)) {
- throw new IOException("Timed out on getting lock for row="
- + Bytes.toStringBinary(row));
- }
- } catch (InterruptedException ie) {
- // Empty
- }
- }
- }
- // loop until we generate an unused lock id
- //锁id是一个原子递增的整数
- while (true) {
- Integer lockId = lockIdGenerator.incrementAndGet();
- HashedBytes existingRowKey = lockIds.putIfAbsent(lockId, rowKey);
- if (existingRowKey == null) {
- return lockId;
- } else {
- // lockId already in use, jump generator to a new spot
- lockIdGenerator.set(rand.nextInt());
- }
- }
- } finally {
- closeRegionOperation();
- }
- }
7.拿mvcc的写事务id
- public WriteEntry beginMemstoreInsert() {
- synchronized (writeQueue) {
- //事务id是一个原子递增的long
- long nextWriteNumber = ++memstoreWrite;
- //entry用来存这个事务的状态,是否已完成
- WriteEntry e = new WriteEntry(nextWriteNumber);
- writeQueue.add(e);
- return e;
- }
- }
- private long internalAdd(final KeyValue toAdd) {
- //堆内存加了多少
- long s = heapSizeChange(toAdd, this.kvset.add(toAdd));
- timeRangeTracker.includeTimestamp(toAdd);
- this.size.addAndGet(s);
- return s;
- }
- private long append(HRegionInfo info, byte [] tableName, WALEdit edits, UUID clusterId,
- final long now, HTableDescriptor htd, boolean doSync)
- throws IOException {
- ......
- long txid = 0;
- synchronized (this.updateLock) {
- //log的序列号
- long seqNum = obtainSeqNum();
- // The 'lastSeqWritten' map holds the sequence number of the oldest
- // write for each region (i.e. the first edit added to the particular
- // memstore). . When the cache is flushed, the entry for the
- // region being flushed is removed if the sequence number of the flush
- // is greater than or equal to the value in lastSeqWritten.
- // Use encoded name. Its shorter, guaranteed unique and a subset of
- // actual name.
- byte [] encodedRegionName = info.getEncodedNameAsBytes();
- //region第一个修改的事务id,flush时所有大于等于该值的entry都会被写入文件
- this.lastSeqWritten.putIfAbsent(encodedRegionName, seqNum);
- HLogKey logKey = makeKey(encodedRegionName, tableName, seqNum, now, clusterId);
- doWrite(info, logKey, edits, htd);
- this.numEntries.incrementAndGet();
- //事务id,代表第几条log
- txid = this.unflushedEntries.incrementAndGet();
- if (htd.isDeferredLogFlush()) {
- lastDeferredTxid = txid;
- }
- }
- // Sync if catalog region, and if not then check if that table supports
- // deferred log flushing
- if (doSync &&
- (info.isMetaRegion() ||
- !htd.isDeferredLogFlush())) {
- // sync txn to file system
- this.sync(txid);
- }
- return txid;
- }
- 写log的cache
- // appends new writes to the pendingWrites. It is better to keep it in
- // our own queue rather than writing it to the HDFS output stream because
- // HDFSOutputStream.writeChunk is not lightweight at all.
- synchronized void append(Entry e) throws IOException {
- pendingWrites.add(e);
- }
- public void releaseRowLock(final Integer lockId) {
- if (lockId == null) return; // null lock id, do nothing
- //先删除lock id
- HashedBytes rowKey = lockIds.remove(lockId);
- if (rowKey == null) {
- LOG.warn("Release unknown lockId: " + lockId);
- return;
- }
- //再删除lock
- CountDownLatch rowLatch = lockedRows.remove(rowKey);
- if (rowLatch == null) {
- LOG.error("Releases row not locked, lockId: " + lockId + " row: "
- + rowKey);
- return;
- }
- //lock释放
- rowLatch.countDown();
- }
- // sync all transactions upto the specified txid
- private void syncer(long txid) throws IOException {
- Writer tempWriter;
- synchronized (this.updateLock) {
- if (this.closed) return;
- tempWriter = this.writer; // guaranteed non-null
- }
- // if the transaction that we are interested in is already
- // synced, then return immediately.
- //如果该txid之前的日志已经sync完成(可能已被其他rpc线程顺带sync掉),直接返回
- if (txid <= this.syncedTillHere) {
- return;
- }
- try {
- long doneUpto;
- long now = System.currentTimeMillis();
- // First flush all the pending writes to HDFS. Then
- // issue the sync to HDFS. If sync is successful, then update
- // syncedTillHere to indicate that transactions till this
- // number has been successfully synced.
- synchronized (flushLock) {
- if (txid <= this.syncedTillHere) {
- return;
- }
- doneUpto = this.unflushedEntries.get();
- //当前所有cache的log
- List<Entry> pending = logSyncerThread.getPendingWrites();
- try {
- //写,但没sync到HDFS
- logSyncerThread.hlogFlush(tempWriter, pending);
- } catch(IOException io) {
- synchronized (this.updateLock) {
- // HBASE-4387, HBASE-5623, retry with updateLock held
- tempWriter = this.writer;
- logSyncerThread.hlogFlush(tempWriter, pending);
- }
- }
- }
- // another thread might have sync'ed avoid double-sync'ing
- if (txid <= this.syncedTillHere) {
- return;
- }
- try {
- //sync到HDFS,写失败重试一次
- tempWriter.sync();
- } catch(IOException io) {
- synchronized (this.updateLock) {
- // HBASE-4387, HBASE-5623, retry with updateLock held
- tempWriter = this.writer;
- tempWriter.sync();
- }
- }
- //当前已sync的日志
- this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto);
- ......
- } catch (IOException e) {
- LOG.fatal("Could not sync. Requesting close of hlog", e);
- //请求滚动日志(log roll),切换到新的日志文件,并非回滚。
- requestLogRoll();
- throw e;
- }
- }
- @Override
- public void append(HLog.Entry entry) throws IOException {
- entry.setCompressionContext(compressionContext);
- try {
- //SequenceFile写入
- this.writer.append(entry.getKey(), entry.getEdit());
- } catch (NullPointerException npe) {
- // Concurrent close...
- throw new IOException(npe);
- }
- }
12.修改mvcc的读事务id
- public void completeMemstoreInsert(WriteEntry e) {
- //递增读事务id
- advanceMemstore(e);
- //等待之前的请求全部完成
- waitForRead(e);
- }
- boolean advanceMemstore(WriteEntry e) {
- synchronized (writeQueue) {
- //事务结束
- e.markCompleted();
- long nextReadValue = -1;
- boolean ranOnce=false;
- //遍历队列,拿到最近已完成的事务id,如果中间有一个请求还未完成,则可能拿到的事务id比当前事务小
- while (!writeQueue.isEmpty()) {
- ranOnce=true;
- WriteEntry queueFirst = writeQueue.getFirst();
- if (nextReadValue > 0) {
- if (nextReadValue+1 != queueFirst.getWriteNumber()) {
- throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: "
- + nextReadValue + " next: " + queueFirst.getWriteNumber());
- }
- }
- if (queueFirst.isCompleted()) {
- nextReadValue = queueFirst.getWriteNumber();
- writeQueue.removeFirst();
- } else {
- break;
- }
- }
- if (!ranOnce) {
- throw new RuntimeException("never was a first");
- }
- //修改读事务的id,所有小于该id的事务都已完成,对read可见
- if (nextReadValue > 0) {
- synchronized (readWaiters) {
- memstoreRead = nextReadValue;
- readWaiters.notifyAll();
- }
- }
- if (memstoreRead >= e.getWriteNumber()) {
- return true;
- }
- return false;
- }
- }
- /**
- * Wait for the global readPoint to advance upto
- * the specified transaction number.
- */
- public void waitForRead(WriteEntry e) {
- boolean interrupted = false;
- synchronized (readWaiters) {
- //如果前面请求还未处理完,则等待它们结束
- while (memstoreRead < e.getWriteNumber()) {
- try {
- readWaiters.wait(0);
- } catch (InterruptedException ie) {
- // We were interrupted... finish the loop -- i.e. cleanup --and then
- // on our way out, reset the interrupt flag.
- interrupted = true;
- }
- }
- }
- if (interrupted) Thread.currentThread().interrupt();
- }
13.检查memstore的内存大小是否超过memstoreFlushSize,是则请求flush,异步
14.返回结果,如果put操作没拿到行锁,则结果是null
http://blog.cloudera.com/blog/2012/06/hbase-write-path/