一涉及到持久化(persistence),哪怕只是最基本的需求,很多人都会依赖数据库,或是其他现成的库或工具。确实,大部分人很少直接和文件打交道,即使用到,也只是整体序列化/反序列化、按行读取、追加新行(append new line)等有限的操作。
一个persistent store最基本的问题是如何组织数据,也就是access method, 大致有:
1) 队列(定长记录或不定长记录):Kafka 等消息队列。访问模式受限,但读写都很快。
2) B+ 树及其变种:NTFS、ext2、DBMS 等。读写性能均衡,且支持 key 的比较、locality of reference,对 cache 友好。
3) Hash:BDB,DBMS等。缺点是不支持key的比较、locality of reference。
4) LSM:LevelDB、HBase、Bigtable 等。写入快,支持 key 的比较和 locality of reference。
下面是一个基于Hash的(拉链法处理冲突)、任意长度key、任意长度value的 key value store,key 和 value都是 byte[]。
import java.io.*;
import java.util.Arrays;
/**
 * A minimal file-backed key/value store using a hash table with separate chaining.
 *
 * <p>File layout:
 * <ul>
 *   <li>Bytes {@code [0, tableSize * 8)}: the slot table. Each slot is a {@code long} file
 *       offset of the first record in that slot's chain, or {@code 0} when the chain is
 *       empty (offset 0 is inside the table, so it can never be a record position).</li>
 *   <li>After the table: records appended one after another, each laid out as
 *       {@code [int keyLen][key bytes][int valueLen][value bytes][long next]}, where
 *       {@code next} is the offset of the next record in the chain or {@code 0}.</li>
 * </ul>
 *
 * <p>Removed or replaced records are only unlinked from their chain; their bytes stay in the
 * file. The table size is not stored in the file, so a file must always be reopened with the
 * same table size it was created with.
 *
 * <p>The class performs no synchronization of its own.
 */
public class HashStore implements Closeable {
    /** Number of slots used by the single-argument constructor (an 8 GiB slot table). */
    private static final long DEFAULT_TABLE_SIZE = 1024L * 1024 * 1024;
    /** Size in bytes of one slot / one chain pointer. */
    private static final int POINTER_BYTES = 8;

    private final RandomAccessFile rf;
    private final long tableSize;

    /** Location of a record that matched a lookup. */
    private static final class Entry {
        /** Offset of the pointer (table slot or predecessor's {@code next}) that references the record. */
        final long linkPos;
        /** Offset of the first value byte of the record. */
        final long valuePos;
        /** Length in bytes of the stored value. */
        final int valueLen;

        Entry(long linkPos, long valuePos, int valueLen) {
            this.linkPos = linkPos;
            this.valuePos = valuePos;
            this.valueLen = valueLen;
        }
    }

    /**
     * Opens (creating if necessary) a store backed by {@code file} with the default table size.
     *
     * @param file path of the data file
     * @throws IOException if the file cannot be opened or initialised
     */
    public HashStore(String file) throws IOException {
        this(file, DEFAULT_TABLE_SIZE);
    }

    /**
     * Opens (creating if necessary) a store backed by {@code file}.
     *
     * @param file path of the data file
     * @param tableSize number of hash slots; must match the value used when the file was created
     * @throws IllegalArgumentException if {@code tableSize} is not positive or is too large
     * @throws IOException if the file cannot be opened, or an existing file is shorter than the table
     */
    public HashStore(String file, long tableSize) throws IOException {
        if (file == null) {
            throw new NullPointerException("file");
        }
        if (tableSize <= 0 || tableSize > Long.MAX_VALUE / POINTER_BYTES) {
            throw new IllegalArgumentException("invalid tableSize: " + tableSize);
        }
        this.tableSize = tableSize;
        // Mode "rw" creates the file when it does not exist yet.
        RandomAccessFile raf = new RandomAccessFile(file, "rw");
        try {
            long tableBytes = tableSize * POINTER_BYTES;
            long length = raf.length();
            if (length == 0) {
                // Extending the file pre-allocates the slot table for a brand-new store.
                raf.setLength(tableBytes);
            } else if (length < tableBytes) {
                throw new IOException("file " + file + " is shorter (" + length
                        + " bytes) than a table of " + tableSize + " slots");
            }
        } catch (IOException | RuntimeException e) {
            // Do not leak the file handle when initialisation fails.
            try {
                raf.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.rf = raf;
    }

    /**
     * Stores {@code v} under {@code k}, replacing any previous value.
     *
     * <p>If the key already exists with a value of the same length, the value is overwritten
     * in place. Otherwise the old record (if any) is unlinked and a new record is appended
     * to the end of the file and made the head of its chain.
     *
     * @param k the key
     * @param v the value
     * @throws IOException on any file error
     */
    public void put(byte[] k, byte[] v) throws IOException {
        if (k == null) {
            throw new NullPointerException("k");
        }
        if (v == null) {
            throw new NullPointerException("v");
        }
        Entry existing = find(k);
        if (existing != null) {
            if (existing.valueLen == v.length) {
                // Same size: reuse the space of the old value.
                rf.seek(existing.valuePos);
                rf.write(v);
                return;
            }
            // Different size: drop the old record from the chain (not physically).
            unlink(existing);
        }
        long slot = slotOffset(k);
        rf.seek(slot);
        long head = rf.readLong();
        long pos = rf.length();
        rf.seek(pos);
        rf.writeInt(k.length);
        rf.write(k);
        rf.writeInt(v.length);
        rf.write(v);
        rf.writeLong(head);
        // Publish the new record as chain head only after it is fully written.
        rf.seek(slot);
        rf.writeLong(pos);
    }

    /**
     * Looks up the value stored under {@code k}.
     *
     * @param k the key
     * @return a fresh copy of the stored value, or {@code null} if the key is absent
     * @throws IOException on any file error
     */
    public byte[] get(byte[] k) throws IOException {
        if (k == null) {
            throw new NullPointerException("k");
        }
        Entry hit = find(k);
        if (hit == null) {
            return null;
        }
        byte[] value = new byte[hit.valueLen];
        rf.seek(hit.valuePos);
        rf.readFully(value);
        return value;
    }

    /**
     * Removes the record stored under {@code k}, if any. The record is only unlinked from
     * its chain; the bytes remain in the file.
     *
     * @param k the key
     * @throws IOException on any file error
     */
    public void remove(byte[] k) throws IOException {
        if (k == null) {
            throw new NullPointerException("k");
        }
        Entry hit = find(k);
        if (hit != null) {
            unlink(hit);
        }
    }

    /**
     * Closes the underlying file.
     *
     * @throws IOException if closing fails
     */
    @Override
    public void close() throws IOException {
        rf.close();
    }

    /** Placeholder for compaction of unlinked records; currently does nothing. */
    public void manageFragment() {
        // garbage collection, compact the data file
    }

    /** Returns the file offset of the table slot that {@code k} hashes to. */
    private long slotOffset(byte[] k) {
        // The absolute value is taken in long so Integer.MIN_VALUE cannot stay negative;
        // for the default table size this maps every key to the same slot as before.
        return (Math.abs((long) Arrays.hashCode(k)) % tableSize) * POINTER_BYTES;
    }

    /** Walks the chain for {@code k} and returns the matching record's location, or {@code null}. */
    private Entry find(byte[] k) throws IOException {
        long linkPos = slotOffset(k);
        rf.seek(linkPos);
        long recordPos = rf.readLong();
        byte[] candidate = new byte[k.length];
        while (recordPos != 0) {
            rf.seek(recordPos);
            int keyLen = rf.readInt();
            if (keyLen < 0) {
                throw new IOException("corrupt record at offset " + recordPos);
            }
            boolean matches = false;
            if (keyLen == k.length) {
                // readFully: a plain read() may legally return fewer bytes than requested.
                rf.readFully(candidate);
                matches = Arrays.equals(candidate, k);
            } else {
                // Keys of a different length cannot match; skip the key bytes.
                rf.seek(rf.getFilePointer() + keyLen);
            }
            int valueLen = rf.readInt();
            if (valueLen < 0) {
                throw new IOException("corrupt record at offset " + recordPos);
            }
            long valuePos = rf.getFilePointer();
            if (matches) {
                return new Entry(linkPos, valuePos, valueLen);
            }
            // The next pointer sits right after the value; it becomes the link to patch
            // if the following record turns out to be the one being removed.
            linkPos = valuePos + valueLen;
            rf.seek(linkPos);
            recordPos = rf.readLong();
        }
        return null;
    }

    /** Removes {@code e} from its chain by pointing its predecessor at its successor. */
    private void unlink(Entry e) throws IOException {
        rf.seek(e.valuePos + e.valueLen);
        long next = rf.readLong();
        rf.seek(e.linkPos);
        rf.writeLong(next);
    }
}
put 一百万个 item,再逐个 get 一次,用时 40 秒。相同的操作放到 SQL Server 上(一张只有 key、value 两个字段的表,key 为主键):写入一百万个 key-value,再逐个按主键查询一次,竟然迟迟结束不了,主要原因是逐条 select 比较慢。
import static org.junit.Assert.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Random;
import org.junit.Test;
/**
 * Round-trip test and rough timing for {@code HashStore}: writes one million
 * key/value pairs in random order, then reads every key back and checks its value.
 */
public class HashStoreTest {
    /**
     * Puts the keys "1".."1000000" (shuffled) with values "-1".."-1000000", reads each
     * one back, and prints the elapsed time in milliseconds for both phases together.
     *
     * @throws IOException if the store cannot be opened, read or written
     */
    @Test
    public void test() throws IOException {
        int n = 1000000;
        Integer[] a = new Integer[n];
        for (int i = 0; i < n; ++i) {
            a[i] = i + 1;
        }
        shuffle(a);
        HashStore hs = new HashStore("c:\\hashStore.txt");
        try {
            long start = System.currentTimeMillis();
            for (int j = 0; j < n; ++j) {
                int i = a[j];
                hs.put(Integer.toString(i).getBytes(), Integer.toString(-i).getBytes());
            }
            for (int j = 0; j < n; ++j) {
                int i = a[j];
                byte[] stored = hs.get(Integer.toString(i).getBytes());
                // Fail with a readable message instead of an NPE inside new String(..).
                assertNotNull("missing value for key " + i, stored);
                assertEquals(Integer.toString(-i), new String(stored));
            }
            System.out.println(System.currentTimeMillis() - start);
        } finally {
            // Release the file handle even when an assertion fails.
            hs.close();
        }
    }

    /**
     * Shuffles {@code a} in place by swapping each element with a randomly chosen
     * element at or before its own index. Does nothing for {@code null}.
     */
    private void shuffle(Integer[] a) {
        if (a == null) {
            return;
        }
        Random r = new Random();
        for (int i = 0; i < a.length; ++i) {
            int j = r.nextInt(i + 1);
            Integer tmp = a[i];
            a[i] = a[j];
            // Must store the saved element; writing a[i] back here would duplicate
            // a[j] and silently drop the original a[i] from the array.
            a[j] = tmp;
        }
    }
}