转载自:http://blackproof.iteye.com/blog/2037159
HBase region split源码分析
一、流程概述
1.HBaseAdmin 发起 hbase split
2.HRegionServer 确定分割点 region split point
3.CompactSplitThread和SplitRequest 进行region分割
3.1SplitTransaction st.prepare()初始化两个子region
3.2splitTransaction execute执行分割
3.2.1两个子region DaughterOpener线程 start
3.2.2若region 需要compact,进行compact过程
3.2.3HRegionServer添加子region到meta表,加入到RegionServer里
3.3修改zk节点状态,等待split结束
二 、hbase region split UML图
三、详细分析
1.HBaseAdmin 发起 hbase split
- //Client entry point. Accepts either a single region name or a table name;
- //for a table, every eligible region of that table is split (optionally only
- //the region containing the given split point).
- public void split(final byte [] tableNameOrRegionName,
- final byte [] splitPoint) throws IOException, InterruptedException {
- CatalogTracker ct = getCatalogTracker();
- try {
- Pair<HRegionInfo, ServerName> regionServerPair
- = getRegion(tableNameOrRegionName, ct);//look up the HRegionInfo; non-null means a single region was named
- if (regionServerPair != null) {
- if (regionServerPair.getSecond() == null) {
- //region known but currently has no hosting server -- cannot split
- throw new NoServerForRegionException(Bytes.toStringBinary(tableNameOrRegionName));
- } else {
- //single-region split -- the key method analyzed below
- split(regionServerPair.getSecond(), regionServerPair.getFirst(), splitPoint);
- }
- } else {
- //table-level split flow: iterate all regions of the table
- final String tableName = tableNameString(tableNameOrRegionName, ct);
- List<Pair<HRegionInfo, ServerName>> pairs =
- MetaReader.getTableRegionsAndLocations(ct,
- tableName);
- for (Pair<HRegionInfo, ServerName> pair: pairs) {
- // May not be a server for a particular row
- if (pair.getSecond() == null) continue;
- HRegionInfo r = pair.getFirst();
- // check for parents -- a region already being split is skipped
- if (r.isSplitParent()) continue;
- // if a split point given, only split that particular region
- if (splitPoint != null && !r.containsRow(splitPoint)) continue;
- // call out to region server to do split now
- split(pair.getSecond(), pair.getFirst(), splitPoint);
- }
- }
- } finally {
- //always release the catalog tracker acquired above
- cleanupCatalogTracker(ct);
- }
- }
2.HRegionServer 确定分割点 region split point
- //RegionServer-side handler: flushes the region, records the (possibly null)
- //explicit split point, then hands the region to the CompactSplitThread.
- @Override
- public void splitRegion(HRegionInfo regionInfo, byte[] splitPoint)
- throws NotServingRegionException, IOException {
- checkOpen();//verify the server and HDFS are available
- HRegion region = getRegion(regionInfo.getRegionName());//look up the HRegion by its region name
- region.flushcache();//flush the cache; the flush is skipped in a few cases:
- //the cache is empty | the region is closed | a flush is already in progress | writes are disabled
- region.forceSplit(splitPoint);//remember the explicit split point (may be null)
- compactSplitThread.requestSplit(region, region.checkSplit());//compute the effective split point and request the split
- }
- //Returns the row key to split at: the explicit split point if one was set,
- //otherwise the split point of the region's largest store (null if none found).
- protected byte[] getSplitPoint() {
- byte[] explicitSplitPoint = this.region.getExplicitSplitPoint();
- if (explicitSplitPoint != null) {
- return explicitSplitPoint;
- }
- Map<byte[], Store> stores = region.getStores();
- byte[] splitPointFromLargestStore = null;
- long largestStoreSize = 0;
- for (Store s : stores.values()) {
- byte[] splitPoint = s.getSplitPoint();
- long storeSize = s.getSize();
- if (splitPoint != null && largestStoreSize < storeSize) {//keep the split point of the largest store seen so far
- splitPointFromLargestStore = splitPoint;
- largestStoreSize = storeSize;
- }
- }
- return splitPointFromLargestStore;
- }
3.CompactSplitThread和SplitRequest 进行region分割
这里是split中较为复杂的过程
- //SplitRequest.run: drives the whole split transaction on the region server.
- //(The body is truncated in the original post after the first catch clause.)
- public void run() {
- //bail out early if the server is going down -- no point starting a split
- if (this.server.isStopping() || this.server.isStopped()) {
- LOG.debug("Skipping split because server is stopping=" +
- this.server.isStopping() + " or stopped=" + this.server.isStopped());
- return;
- }
- try {
- final long startTime = System.currentTimeMillis();
- SplitTransaction st = new SplitTransaction(parent, midKey);
- // If prepare does not return true, for some reason -- logged inside in
- // the prepare call -- we are not ready to split just now. Just return.
- //<strong> 3.1 SplitTransaction st.prepare() initializes the two daughter regions</strong>
- if (!st.prepare()) return;
- try {
- st.execute(this.server, this.server);//<strong>3.2 splitTransaction execute performs the split</strong>
- this.server.getMetrics().incrementSplitSuccessCount();
- } catch (Exception e) {
- 。。。。。。。。。。。。
3.2splitTransaction execute执行分割
- //Runs the split in three phases -- create daughters, open daughters,
- //transition the zk node -- and returns the pair of daughter regions.
- public PairOfSameType<HRegion> execute(final Server server,
- final RegionServerServices services)
- throws IOException {
- PairOfSameType<HRegion> regions = createDaughters(server, services);
- //create the split temp dir, change the region's zk state, close the region, stop all store services
- //create the daughter dirs and place the parent's storefile references into them
- //create daughter regions A and B, register them in zk, and mark the parent HRI offline
- openDaughters(server, services, regions.getFirst(), regions.getSecond());
- transitionZKNode(server, services, regions.getFirst(), regions.getSecond());
- return regions;
- }
3.2.1两个子region DaughterOpener线程 start
- //Tail of SplitTransaction.openDaughters (the first line of the signature is
- //truncated in the original post). Opens both daughters in parallel threads,
- //then registers them with the region server.
- final RegionServerServices services, HRegion a, HRegion b)
- throws IOException {
- boolean stopped = server != null && server.isStopped();
- boolean stopping = services != null && services.isStopping();
- // TODO: Is this check needed here?
- if (stopped || stopping) {
- LOG.info("Not opening daughters " +
- b.getRegionInfo().getRegionNameAsString() +
- " and " +
- a.getRegionInfo().getRegionNameAsString() +
- " because stopping=" + stopping + ", stopped=" + stopped);
- } else {
- // Open daughters in parallel. Create the opener tasks for the two daughter regions.
- DaughterOpener aOpener = new DaughterOpener(server, a);
- DaughterOpener bOpener = new DaughterOpener(server, b);
- aOpener.start();
- bOpener.start();
- try {
- //wait for both daughter opens to finish
- aOpener.join();
- bOpener.join();
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- throw new IOException("Interrupted " + e.getMessage());
- }
- //propagate any failure from either opener as an IOException
- if (aOpener.getException() != null) {
- throw new IOException("Failed " +
- aOpener.getName(), aOpener.getException());
- }
- if (bOpener.getException() != null) {
- throw new IOException("Failed " +
- bOpener.getName(), bOpener.getException());
- }
- if (services != null) {
- try {
- // add 2nd daughter first (see HBASE-4335)
- services.postOpenDeployTasks(b, server.getCatalogTracker(), true);
- // Should add it to OnlineRegions
- services.addToOnlineRegions(b);
- services.postOpenDeployTasks(a, server.getCatalogTracker(), true);
- services.addToOnlineRegions(a);
- } catch (KeeperException ke) {
- throw new IOException(ke);
- }
- }
- }
调用HRegion 打开方法openHRegion
- //Opens this region: initializes its stores, replays recovered edits, sets
- //the log sequence number, and returns this region.
- protected HRegion openHRegion(final CancelableProgressable reporter)
- throws IOException {
- checkCompressionCodecs();
- long seqid = initialize(reporter);
- //initialize the region:
- //1.checkRegionInfoOnFilesystem writes the HRegionInfo to a file
- //2.cleanupTempDir clears the old region's temp directory
- //3.initialize the HRegion stores and load their hfiles
- //4.read the recover.edits files, route each keyvalue to its store, and replay them to recover the HRegion
- if (this.log != null) {
- this.log.setSequenceNumber(seqid);
- }
- return this;
- }
3.2.2若region 需要compact,进行compact过程
compact过程有点复杂,过程如下:
1.将所有storefile放入compact候选者
2.交给coprocessor做处理,选择compact storefile
3.若coprocessor没有做处理,则采用系统算法选择
3.1必须要进行compact的文件,文件大小大于compact最大值并且没有其他被引用
3.2必须要进行compact文件小于compact文件最小数
3.3 isMajorCompaction判断是否需要major compact
3.3.1当ttl大于storefile中最大文件compact time,则不需要
3.3.2 以上反之,需要
3.3.3 最后一次major compaction时间大于majorCompactionTime,需要
3.4 当compact文件大于compact文件最大数,且需要major compaction或强制major compaction,则进行major compaction
3.5或者进行minor compact,它们两个的区别在于一个compact文件数是所有,一个compact文件数不大于maxcompactfile配置
- //Selects storefiles to compact (letting a coprocessor override the choice),
- //marks them as in-flight in filesCompacting, and returns a CompactionRequest
- //(null when writes are disabled or nothing is selected).
- public CompactionRequest requestCompaction(int priority) throws IOException {
- // don't even select for compaction if writes are disabled
- if (!this.region.areWritesEnabled()) {
- return null;
- }
- CompactionRequest ret = null;
- this.lock.readLock().lock();
- try {
- synchronized (filesCompacting) {
- // candidates = all storefiles not already in compaction queue
- List<StoreFile> candidates = Lists.newArrayList(storefiles);
- if (!filesCompacting.isEmpty()) {
- // exclude all files older than the newest file we're currently
- // compacting. this allows us to preserve contiguity (HBASE-2856)
- StoreFile last = filesCompacting.get(filesCompacting.size() - 1);
- int idx = candidates.indexOf(last);
- Preconditions.checkArgument(idx != -1);
- candidates.subList(0, idx + 1).clear();
- }
- //give the coprocessor first chance to pick the files to compact
- boolean override = false;
- if (region.getCoprocessorHost() != null) {
- override = region.getCoprocessorHost().preCompactSelection(
- this, candidates);
- }
- CompactSelection filesToCompact;
- if (override) {
- // coprocessor is overriding normal file selection
- filesToCompact = new CompactSelection(conf, candidates);
- } else {
- //otherwise use the built-in selection algorithm
- filesToCompact = compactSelection(candidates, priority);
- }
- //let the coprocessor observe the final selection
- if (region.getCoprocessorHost() != null) {
- region.getCoprocessorHost().postCompactSelection(this,
- ImmutableList.copyOf(filesToCompact.getFilesToCompact()));
- }
- // no files to compact
- if (filesToCompact.getFilesToCompact().isEmpty()) {
- return null;
- }
- // basic sanity check: do not try to compact the same StoreFile twice.
- if (!Collections.disjoint(filesCompacting, filesToCompact.getFilesToCompact())) {
- // TODO: change this from an IAE to LOG.error after sufficient testing
- Preconditions.checkArgument(false, "%s overlaps with %s",
- filesToCompact, filesCompacting);
- }
- //mark the selected files as in-flight so later requests skip them
- filesCompacting.addAll(filesToCompact.getFilesToCompact());
- Collections.sort(filesCompacting, StoreFile.Comparators.FLUSH_TIME);
- // major compaction iff all StoreFiles are included
- boolean isMajor = (filesToCompact.getFilesToCompact().size() == this.storefiles.size());
- if (isMajor) {
- // since we're enqueuing a major, update the compaction wait interval
- this.forceMajor = false;
- }
- // everything went better than expected. create a compaction request
- int pri = getCompactPriority(priority);
- ret = new CompactionRequest(region, this, filesToCompact, isMajor, pri);
- }
- } finally {
- this.lock.readLock().unlock();
- }
- if (ret != null) {
- CompactionRequest.preRequest(ret);
- }
- return ret;
- }
3.2.3HRegionServer添加子region到meta表,加入到RegionServer里
更新meta表
- // If daughter of a split, update whole row, not just location. (updates the daughter's location and rowkey in the meta table)
- MetaEditor.addDaughter(ct, r.getRegionInfo(),
- this.serverNameFromMasterPOV);
加入regionserver
- //Registers an opened region in this server's online-regions map, keyed by its encoded name.
- public void addToOnlineRegions(HRegion region) {
- this.onlineRegions.put(region.getRegionInfo().getEncodedName(), region);
- }
3.3修改zk节点状态,等待split结束
- //Transitions the zk node to SPLIT and then keeps "tickling" it until the
- //master processes the split and deletes the znode (version becomes -1).
- //(The tail of the method is truncated in the original post.)
- /* package */void transitionZKNode(final Server server,
- final RegionServerServices services, HRegion a, HRegion b)
- throws IOException {
- // Tell master about split by updating zk. If we fail, abort.
- if (server != null && server.getZooKeeper() != null) {
- try {
- this.znodeVersion = transitionNodeSplit(server.getZooKeeper(),
- parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
- server.getServerName(), this.znodeVersion);
- int spins = 0;
- // Now wait for the master to process the split. We know it's done
- // when the znode is deleted. The reason we keep tickling the znode is
- // that it's possible for the master to miss an event.
- do {
- //log progress every ~1s (10 spins x 100ms)
- if (spins % 10 == 0) {
- LOG.debug("Still waiting on the master to process the split for " +
- this.parent.getRegionInfo().getEncodedName());
- }
- Thread.sleep(100);
- // When this returns -1 it means the znode doesn't exist
- this.znodeVersion = tickleNodeSplit(server.getZooKeeper(),
- parent.getRegionInfo(), a.getRegionInfo(), b.getRegionInfo(),
- server.getServerName(), this.znodeVersion);
- spins++;
- } while (this.znodeVersion != -1 && !server.isStopped()
- && !services.isStopping());
结束了,有时间再看看compact过程,其实在split中已经包含compact的过程,不知道是不是所有的compact流程都一样