项目需要实现来电归属地查询,所以找到了下面这篇文章。
原理主要在(一)和(二)中,作者的数据压缩思路很给力,将6M的原始文本数据压缩到400kb左右。原作者已经将分析讲得很清楚了,这里提炼一下要点,并将C++实现转化成Java实现。
1、压缩主要通过减小数据冗余完成;
2、原始数据格式:手机号码前7位,城市名
3、数据冗余点,手机号码有递增规律、城市重复排列
4、将手机号码的排列转换为号码区间,城市名建索引
5、号码区间用2个short,城市索引用1个short
6、为了查询的效率,原始数据按号码递增排列,这样查询的时候能够用二分快速查找
下面是原文中的一张图,可以更好地理解数据的存储结构
转换实现
private static void convertTXTtoDAT() {
File readFile = new File(CONVERT_TXT_NAME);
if (!readFile.exists()) {
System.out.println(CONVERT_TXT_NAME + " not exist");
return;
}
File writeFile = new File(OUTPUT_FILE_NAME);
try {
writeFile.createNewFile();
} catch (IOException e) {
e.printStackTrace();
}
if (!writeFile.exists()) {
System.out.println(OUTPUT_FILE_NAME + " not create!");
return;
}
CityCollector cityCollector = new CityCollector();
BufferedReader reader = null;
RandomAccessFile writer = null;
try {
reader = new BufferedReader(new FileReader(readFile));
writer = new RandomAccessFile(writeFile, "rw");
int count = 0;
writer.writeInt(count);
String data = reader.readLine();
String[] content = data.split(",");
int number = Integer.parseInt(content[0]);
String cityName = content[1];
int cityIndex = cityCollector.putCity(cityName);
NumberCompressor compressor = new NumberCompressor(number,
cityIndex);
while ((data = reader.readLine()) != null) {
content = data.split(",");
if (content.length != 2) {
continue;
}
number = Integer.parseInt(content[0]);
cityName = content[1];
cityIndex = cityCollector.putCity(cityName);
if (cityIndex == compressor.getCityIndex()
&& number - compressor.getCurrentNumber() == 1) {
compressor.increaseSkipNum();
} else {
writer.writeShort(compressor.getStartNum());
writer.writeShort(compressor.getAfterNum());
writer.writeShort(compressor.getCityIndex());
++count;
compressor = new NumberCompressor(number, cityIndex);
}
}
writer.writeShort(compressor.getStartNum());
writer.writeShort(compressor.getAfterNum());
writer.writeShort(cityIndex);
++count;
writer.seek(0);
writer.writeInt(count);
writer.seek(writer.length());
writer.write(cityCollector.getFormatCityByte());
} catch (IOException e) {
} finally {
if (reader != null) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (writer != null) {
try {
writer.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}其中CityCollector是存储城市信息的工具类,按顺序存储城市名,并返回其索引,最后和号码数据一起存入文件
/**
 * Collects distinct city names in insertion order and serializes them as a
 * table of fixed-width slots.
 */
public class CityCollector {
    /**
     * City names are stored at a fixed length so that a lookup can locate them
     * quickly; adjust as needed.
     */
    public static final int MAXCITYLENGTH = 34;

    private final ArrayList<String> mCityList = new ArrayList<String>();

    public CityCollector() {
    }

    /**
     * Serializes all collected cities into one byte array.
     *
     * <p>City {@code i} occupies bytes {@code [i * MAXCITYLENGTH, (i + 1) * MAXCITYLENGTH)};
     * unused bytes of a slot stay zero. A name whose encoded form is longer than
     * {@code MAXCITYLENGTH} bytes is cut to the slot size so that it cannot spill
     * into the next slot or overflow the buffer.
     *
     * <p>Names are encoded with the platform default charset.
     *
     * @return a new array of {@code size * MAXCITYLENGTH} bytes
     */
    public byte[] getFormatCityByte() {
        int size = mCityList.size();
        ByteBuffer buffer = ByteBuffer.allocate(size * MAXCITYLENGTH);
        for (int i = 0; i < size; ++i) {
            byte[] name = mCityList.get(i).getBytes();
            buffer.position(MAXCITYLENGTH * i);
            buffer.put(name, 0, Math.min(name.length, MAXCITYLENGTH));
        }
        return buffer.array();
    }

    /**
     * Registers a city name if it is not known yet.
     *
     * @param city the city name
     * @return the index of the city: the existing one when the name was already
     *         registered, otherwise the index of the newly appended entry
     */
    public int putCity(String city) {
        int cityIndex = mCityList.indexOf(city);
        if (cityIndex != -1) {
            return cityIndex;
        }
        mCityList.add(city);
        return mCityList.size() - 1;
    }
}
// NumberCompressor is the structure used for compressed number storage: it holds
// the first number of a range, the count of numbers in that range, and the city
// index of the range (all numbers within one range belong to the same city).
/**
 * One run of consecutive numbers that share a city: the first number of the
 * run, how many further numbers follow it, and the city index.
 */
public class NumberCompressor {
    private final int firstNumber;
    private final int city;
    /** Count of numbers in the run after the first one. */
    private int followers;

    /**
     * Starts a run that contains only {@code number}.
     *
     * @param number    first number of the run
     * @param cityIndex index of the city the run belongs to
     */
    public NumberCompressor(int number, int cityIndex) {
        this.firstNumber = number;
        this.city = cityIndex;
        this.followers = 0;
    }

    /** Extends the run by one number. */
    public void increaseSkipNum() {
        followers += 1;
    }

    /** @return the last number currently covered by the run */
    public int getCurrentNumber() {
        return firstNumber + followers;
    }

    /** @return the city index given at construction */
    public int getCityIndex() {
        return city;
    }

    /** @return the first number with its last two decimal digits removed */
    public int getStartNum() {
        return firstNumber / 100;
    }

    /**
     * @return the last two decimal digits of the first number plus one hundred
     *         times the count of following numbers
     */
    public int getAfterNum() {
        int lowTwoDigits = firstNumber % 100;
        return lowTwoDigits + followers * 100;
    }
}
// Lookup is the reverse of storing: because the data is stored in order, the
// lookup is implemented with a binary search, as shown below.
/**
 * Looks up the city for a number in the binary file {@code OUTPUT_FILE_NAME}
 * using a binary search over the range records.
 *
 * <p>The file is expected to start with a 4-byte record count, followed by
 * that many 6-byte records (three 16-bit fields) and then the city table with
 * {@code CityCollector.MAXCITYLENGTH} bytes per entry.
 *
 * <p>The 16-bit fields are read as unsigned values; for non-negative shorts
 * this yields the same result as a signed read, and it additionally decodes
 * packed values above {@code Short.MAX_VALUE} correctly.
 *
 * @param number the number to look up
 * @return the city name, or {@code null} when no range contains the number or
 *         the file cannot be read (the error is printed via
 *         {@code printStackTrace()})
 */
private static String searchNumberLocation(int number) {
    File file = new File(OUTPUT_FILE_NAME);
    RandomAccessFile readFile = null;
    String result = null;
    try {
        readFile = new RandomAccessFile(file, "r");
        int count = readFile.readInt();
        int left = 0;
        int right = count - 1;
        int blockSize = 3 * 2; // 3 * short
        while (left <= right) {
            int middle = left + (right - left) / 2;
            // 4 is the size of the leading count (int); compute offsets as long.
            readFile.seek(4 + (long) middle * blockSize);
            int firstNum = readFile.readUnsignedShort();
            int secondNum = readFile.readUnsignedShort();
            // Undo the packing: firstNum = begin / 100,
            // secondNum = begin % 100 + runLength * 100.
            int beginNum = firstNum * 100 + secondNum % 100;
            int endNum = beginNum + secondNum / 100;
            if (number < beginNum) {
                right = middle - 1;
            } else if (number > endNum) {
                left = middle + 1;
            } else {
                int cityIndex = readFile.readUnsignedShort();
                readFile.seek(4 + (long) count * blockSize
                        + (long) cityIndex * CityCollector.MAXCITYLENGTH);
                byte[] b = new byte[CityCollector.MAXCITYLENGTH];
                // readFully: a plain read() may return fewer bytes than requested.
                readFile.readFully(b);
                // Slots are zero-padded; trim() removes the padding.
                result = new String(b).trim();
                break;
            }
        }
    } catch (IOException e) {
        // Also covers FileNotFoundException.
        e.printStackTrace();
    } finally {
        if (readFile != null) {
            try {
                readFile.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}