B+树
先说红黑树。红黑树是平衡二叉查找树。它通常用于Map结构的实现,可以根据Key快速找到Value。Java中的TreeMap就是通过红黑树实现的。它是一种二叉树,一个节点最多只能有两个子节点。
当BST的存储结构放在硬盘上时,就出现问题了。因为二叉树的高度比较大,寻找一个叶子节点需要较多次数访问硬盘。我们知道访问硬盘的速度是非常慢的,所以BST不太适合用在硬盘上。然而B+树可以解决这个问题。
B+树针对这种情况做了改良,修改了BST中只有两个子节点的限制,变成了最多有M个节点,M要大于等于3,可以是很大的值如M=1000。这里通过几个例子分别介绍增、删、查操作。你可能听说过二三树,它本质上就是B树(不是B+树)在M=3时的特殊形态。
下面通过一个具体的例子展示B+树大致的模样。例子中M=5。图片通过工具生成:https://www.cs.usfca.edu/~galles/visualization/BPlusTree.html。
第一步:我们往空白的B+树中放入一个key=0,那么跟二叉树一样得到如下结果:
第二步:放入一条数据key=20,由于B+树的一个节点可以放多个值,这个例子中M=5,所以最多可以放5个,得到如下结果:
第三步:放入一条数据key=10,由于B+树的key要保持顺序,所以插到了中间,得到如下结果:
第四步:放入一条数据key=30,得到如下结果:
第五步:放入一条数据key=40,得到如下结果:
由于一个节点放了5条数据,容量已满,需要进行分割。分割的办法就是以中间的key作为分界点,分割成两个节点,并把分界的节点提到上一层。
注意分割之后key=20出现了两次。这两个节点的区别在于叶子节点有value数据,而中间节点没有value数据只有key。分割之后的两个节点之间出现了一个箭头,表示指针,用于遍历树的时候快速跳转到下一个区块。
以上就是B+树插入数据的大概样子。末尾附上笔者实现的B+树代码,可以了解其原理:(依据Wiki的介绍实现:https://en.wikipedia.org/wiki/B%2B_tree)
B+树的数据节点之间像链表一样相连,因此遍历树的操作不需要再去访问中间节点,因此在硬盘上做遍历操作是比较快的。但是数据节点在硬盘上不一定紧挨在一起,有可能相隔很远,因此HBase中提供了优化表OPTIMIZE TABLE
的操作,其原理是将数据重写一遍,让数据能够在硬盘上尽量连续。于是扫描操作就变成了硬盘的连续读取操作,能够提升查询速度。
package btree;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
public class BPlusTree {
public static void main(String[] args) {
Node<String> tree = new LeafNode<String>(5);
List<Integer> list = new ArrayList<Integer>();
int count = 1000000;
for (int i = 0; i < count; i++) {
list.add(i);
}
Random random = new Random(1);
Collections.shuffle(list, random);
for (int e : list) {
tree = tree.insert(e, "str" + random.nextInt(10000000));
}
for (int i = 0; i < count; i++) {
System.out.println(i + ": " + tree.get(i));
}
}
}
abstract class Node<T> {
protected int[] keys;
protected int keyCount;
protected int maxKeys;
public Node(int m) {
this.maxKeys = m;
this.keys = new int[m];
}
protected int locate(int key) {
for (int i = 0; i < keyCount; i++) {
if (key < keys[i]) return i;
}
return keyCount;
}
public static <V> void arrayInsert(int size, V[] array, int pos, V data) {
for (int i = size; i > pos; i--) {
array[i] = array[i - 1];
}
array[pos] = data;
}
public static void arrayInsert(int size, int[] array, int pos, int data) {
for (int i = size; i > pos; i--) {
array[i] = array[i - 1];
}
array[pos] = data;
}
public static <V> V[] arraySlice(V[] array, int start, int end) {
return Arrays.copyOfRange(array, start, end);
}
public static <V> V[] arraySlice(V[] array, int start) {
return arraySlice(array, start, array.length);
}
public static int[] arraySlice(int[] array, int start, int end) {
return Arrays.copyOfRange(array, start, end);
}
public static int[] arraySlice(int[] array, int start) {
return arraySlice(array, start, array.length);
}
protected boolean isFull() {
return keyCount >= maxKeys;
}
public abstract T get(int key);
public abstract Node<T> insert(int key, T value);
}
class InternalNode<T> extends Node<T> {
private Node<T>[] children;
public InternalNode(int maxKeys, int[] keys, Node<T>[] children) {
super(maxKeys);
this.children = new Node[maxKeys + 1];
keyCount = keys.length;
for (int i = 0; i < keyCount; i++) {
this.keys[i] = keys[i];
this.children[i] = children[i];
}
this.children[keyCount] = children[keyCount];
}
@Override
public T get(int key) {
int i = locate(key);
return children[i].get(key);
}
@Override
public InternalNode<T> insert(int key, T value) {
int i = locate(key);
Node<T> newNode = children[i].insert(key, value);
if (newNode == children[i]) return this;
if (newNode.keyCount != 1) throw new RuntimeException("Expect only one key");
InternalNode<T> asInternalNode = (InternalNode<T>) newNode;
arrayInsert(keyCount, keys, i, newNode.keys[0]);
children[i] = asInternalNode.children[0];
arrayInsert(keyCount + 1, children, i + 1, asInternalNode.children[1]);
keyCount++;
if (!isFull()) return this;
return split();
}
private InternalNode<T> split() {
int splitPoint = keyCount / 2;
int[] keys1 = arraySlice(keys, 0, splitPoint);
int middleKey = keys[splitPoint];
int[] keys2 = arraySlice(keys, splitPoint + 1);
Node<T>[] children1 = arraySlice(children, 0, splitPoint + 1);
Node<T>[] children2 = arraySlice(children, splitPoint + 1);
InternalNode<T> node1 = new InternalNode<T>(maxKeys, keys1, children1);
InternalNode<T> node2 = new InternalNode<T>(maxKeys, keys2, children2);
return new InternalNode<T>(maxKeys, new int[]{middleKey}, new Node[]{node1, node2});
}
}
class LeafNode<T> extends Node<T> {
private T[] values;
public LeafNode(int maxKeys) {
super(maxKeys);
this.values = (T[]) new Object[maxKeys];
}
public LeafNode(int maxKeys, int[] keys, T[] values) {
this(maxKeys);
this.keyCount = keys.length;
for (int i = 0; i < keys.length; i++) {
this.keys[i] = keys[i];
this.values[i] = values[i];
}
}
@Override
public T get(int key) {
for (int i = 0; i < keyCount; i++) {
if (keys[i] == key) return values[i];
}
return null;
}
@Override
public Node<T> insert(int key, T value) {
int i = locate(key);
arrayInsert(keyCount, keys, i, key);
arrayInsert(keyCount, values, i, value);
keyCount++;
if (!isFull()) return this;
return split();
}
private InternalNode<T> split() {
int splitPoint = keyCount / 2;
int[] keys1 = arraySlice(keys, 0, splitPoint);
int[] keys2 = arraySlice(keys, splitPoint);
T[] values1 = arraySlice(values, 0, splitPoint);
T[] values2 = arraySlice(values, splitPoint);
LeafNode<T> node1 = new LeafNode<T>(maxKeys, keys1, values1);
LeafNode<T> node2 = new LeafNode<T>(maxKeys, keys2, values2);
return new InternalNode<T>(maxKeys, new int[]{keys2[0]}, new Node[]{node1, node2});
}
}