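// It took me about a day and a half to finish reading through this ternary search tree implementation. I still have some doubts about it: the comments in the code below record my own understanding, and a few open questions remain. Readers who understand the code are welcome to point out any mistakes and to answer those questions.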
package org.apache.spell;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Stack;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
/**
* Ternary search tree implementation: a data structure for storing and sorting strings.
*
* <p>Implementation of a Ternary Search Trie, a data structure for storing <code>String</code> objects
* that combines the compact size of a binary search tree with the speed of a digital search trie, and is
* therefore ideal for practical use in sorting and searching data.</p>
*
* <p>This data structure is faster than hashing for many typical search problems, and supports
* a broader range of useful problems and operations. Ternary searches are faster than
* hashing and more powerful, too.</p>
*
* <p>The theory of ternary search trees was described at a symposium in 1997 (see "Fast
* Algorithms for Sorting and Searching Strings," by J.L. Bentley and R. Sedgewick,
* Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete Algorithms, January 1997).
* Algorithms in C, Third Edition, by Robert Sedgewick (Addison-Wesley, 1998) provides
* yet another view of ternary search trees.</p>
*
* @author Bruno Martins
*
*/
public class TernarySearchTrie {
/**
 * Loads a trie dictionary file as a simple smoke test.
 *
 * @param args optional; when at least one argument is given, {@code args[0]} is used as the
 *     path of the dictionary file. Without arguments the original hard-coded development
 *     path is used, so existing no-argument invocations behave as before.
 * @throws Exception if constructing the trie from the file fails
 */
public static void main(String[] args) throws Exception {
    // Fall back to the historical hard-coded location when no path is supplied.
    String triefile = (args != null && args.length > 0)
            ? args[0]
            : "E://Java Projects//ses//src//test//lucene//dic//spell//trie.txt";
    // The instance is created only to exercise the file-loading constructor.
    new TernarySearchTrie(new File(triefile));
}
// Trie construction, node creation, deletion, etc.
/**
 * A nested class of the Ternary Search Trie that represents a single node in the trie.
 */
public static final class TSTNode {
    /** Index of the parent node in {@link #relatives}. */
    protected final static int PARENT = 0;
    /** Index of the low (left) child in {@link #relatives}. */
    protected final static int LOKID = 1;
    /** Index of the equal (middle) child in {@link #relatives}. */
    protected final static int EQKID = 2;
    /** Index of the high (right) child in {@link #relatives}. */
    protected final static int HIKID = 3;

    /** The value held by this node. */
    protected Object data;
    /** The four related nodes: parent, low, equal and high, addressed by the index constants. */
    protected TSTNode[] relatives = new TSTNode[4];
    /** The character used in the split at this node. */
    protected char splitchar;

    /**
     * Creates a node for the given split character and links it to its parent.
     *
     * @param splitchar the character used in the split
     * @param parent the parent node; stored in the {@link #PARENT} slot of {@link #relatives}
     */
    protected TSTNode(char splitchar, TSTNode parent) {
        relatives[PARENT] = parent;
        this.splitchar = splitchar;
    }

    /**
     * Renders the node as {@code splitchar:data}.
     *
     * @return the split character, a colon, and the string form of {@link #data}
     */
    @Override
    public String toString() {
        return splitchar + ":" + data;
    }
}
/**
 * A plain key/value holder for one trie entry, i.e. one line of the dictionary file in the
 * form {@code word:integer}, where {@code key} is the word and {@code data} is the integer.
 *
 * @author shentingting
 */
protected static class TSTItem {
    /** The key of this entry, i.e. the target string. */
    protected String key;
    /** The value of this entry. */
    protected Object data;

    /**
     * Creates an item from a key and its value.
     *
     * @param key the key indexing this entry
     * @param data the value of this entry
     */
    protected TSTItem(String key, Object data) {
        this.data = data;
        this.key = key;
    }
}
/**
* Compares characters by alphabetical order.
* 按字母顺序比较字符
*@param cCompare2 The first char in the comparison. 第一个字符
*@param cRef The second char in the comparison. 第二个字符
*@return A negative number, 0 or a positive number if the second
* char is less, equal or greater.
* 当第二个字符小于第一个字符 返回 负数
* 当第二个字符等于第一个字符 返回 0
* 当第二个字符大于第一个字符 返回 正数
* ASCII码对应值:
* A-Z 65-90
* a-z 97-122
* 其中忽略了a-x之间字符的大小写敏感度,经过下面的处理后其比较字符表从Ascii码转换成
* A a B b C c .... X x Y Z y z
* 65 66 67 68.........111 112 113 114 121 122
* 至于为何YZyz四个字符没有作相同处理暂时还不知其原由
*/
private static int compareCharsAlphabetically( int cCompare2, int cRef) {
int cCompare = 0;
if (cCompare2 > = 65) { //从A开始
if (cCompare2 < 89) { //A-Y之间的字符(不包含Y)
cCompare = ( 2 * cCompare2) - 65;
} else if (cCompare2 < 97) { //在Y-a之间的字符(不包含a)
cCompare = cCompare2 + 24;
} else if (cCompare2 < 121) { //在a-y之间的字符(不包含y)
cCompare = ( 2 * cCompare2) - 128;
} else
cCompare = cCompare2;
} else //A之前的字符(不包含A)
cCompare = cCompare2;
if (cRef < 65) {
return cCompare - cRef;
}
if (cRef < 89) {
return cCompare - (( 2 * cRef) - 65);
}
if (cRef < 97) {
return cCompare - (cRef + 24);
}
if (cRef < 121) {
return cCompare - (( 2 * cRef) - 128);
}
return cCompare - cRef;
}
/** The default number of values returned by the <code>matchAlmost</code> method. */
private int defaultNumReturnValues = - 1; // default number of return values
/** The number of differences allowed in a call to the <code>matchAlmostKey</code> method. */
private int matchAlmostDiff;
/** The base node in the trie. */
private TSTNode rootNode; // root node; has no initializer, so it starts out null
/**
 * Constructs an empty Ternary Search Trie.
 * The body is empty, so all fields keep the values from their declarations.
 */
public TernarySearchTrie() {
}
/**
 * Constructs a Ternary Search Trie and loads data from a <code>File</code> into the Trie.
 * The file is a normal text document, where each line is of the form
 * word : integer.
 *
 * <p>Delegates to the two-argument constructor with <code>compression</code> set to
 * <code>false</code>.
 *
 * @param file The <code>File</code> with the data to load into the Trie.
 * @throws IOException A problem occurred while reading the data.
 */
public TernarySearchTrie(File file) throws IOException {
this(file,false);
}
/**
* 从文件中载入数据到字典树
* 一个普通文本文档每行的格式:word : integer
* Constructs a Ternary Search Trie and loads data from a <code>File</code> into the Trie.
* The file is a normal text document, where each line is of the form " word : integer".
*
*@param file The <code>File</code> with the data to load into the Trie.
*@param compression If true, the file is compressed with the GZIP algorithm, and if false,
* the file is a normal text document.
* true:文件根据GZIP算法压缩
* false:普通的文本文档
*@exception IOException A problem occured while reading the data.
*/
public TernarySearchTrie(File file, boolean compression) throws IOException {
this();
BufferedReader in;
//如果是压缩文件则通过建立解压缩输出流
if(compression) in = new BufferedReader( new InputStreamReader( new GZIPInputStream( new FileInputStream(file))));
else in = new BufferedReader( new InputStreamReader(( new FileInputStream(file))));
String word;
int pos;
int occur;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.Stack;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.zip.GZIPInputStream;
/**
* Ternary search tree implementation: a data structure for storing and sorting strings.
*
* <p>Implementation of a Ternary Search Trie, a data structure for storing <code>String</code> objects
* that combines the compact size of a binary search tree with the speed of a digital search trie, and is
* therefore ideal for practical use in sorting and searching data.</p>
*
* <p>This data structure is faster than hashing for many typical search problems, and supports
* a broader range of useful problems and operations. Ternary searches are faster than
* hashing and more powerful, too.</p>
*
* <p>The theory of ternary search trees was described at a symposium in 1997 (see "Fast
* Algorithms for Sorting and Searching Strings," by J.L. Bentley and R. Sedgewick,
* Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete Algorithms, January 1997).
* Algorithms in C, Third Edition, by Robert Sedgewick (Addison-Wesley, 1998) provides
* yet another view of ternary search trees.</p>
*
* @author Bruno Martins
*
*/
public class TernarySearchTrie {
/**
 * Loads a trie dictionary file as a simple smoke test.
 *
 * @param args optional; when at least one argument is given, {@code args[0]} is used as the
 *     path of the dictionary file. Without arguments the original hard-coded development
 *     path is used, so existing no-argument invocations behave as before.
 * @throws Exception if constructing the trie from the file fails
 */
public static void main(String[] args) throws Exception {
    // Fall back to the historical hard-coded location when no path is supplied.
    String triefile = (args != null && args.length > 0)
            ? args[0]
            : "E://Java Projects//ses//src//test//lucene//dic//spell//trie.txt";
    // The instance is created only to exercise the file-loading constructor.
    new TernarySearchTrie(new File(triefile));
}
// Trie construction, node creation, deletion, etc.
/**
 * A nested class of the Ternary Search Trie that represents a single node in the trie.
 */
public static final class TSTNode {
    /** Index of the parent node in {@link #relatives}. */
    protected final static int PARENT = 0;
    /** Index of the low (left) child in {@link #relatives}. */
    protected final static int LOKID = 1;
    /** Index of the equal (middle) child in {@link #relatives}. */
    protected final static int EQKID = 2;
    /** Index of the high (right) child in {@link #relatives}. */
    protected final static int HIKID = 3;

    /** The value held by this node. */
    protected Object data;
    /** The four related nodes: parent, low, equal and high, addressed by the index constants. */
    protected TSTNode[] relatives = new TSTNode[4];
    /** The character used in the split at this node. */
    protected char splitchar;

    /**
     * Creates a node for the given split character and links it to its parent.
     *
     * @param splitchar the character used in the split
     * @param parent the parent node; stored in the {@link #PARENT} slot of {@link #relatives}
     */
    protected TSTNode(char splitchar, TSTNode parent) {
        relatives[PARENT] = parent;
        this.splitchar = splitchar;
    }

    /**
     * Renders the node as {@code splitchar:data}.
     *
     * @return the split character, a colon, and the string form of {@link #data}
     */
    @Override
    public String toString() {
        return splitchar + ":" + data;
    }
}
/**
 * A plain key/value holder for one trie entry, i.e. one line of the dictionary file in the
 * form {@code word:integer}, where {@code key} is the word and {@code data} is the integer.
 *
 * @author shentingting
 */
protected static class TSTItem {
    /** The key of this entry, i.e. the target string. */
    protected String key;
    /** The value of this entry. */
    protected Object data;

    /**
     * Creates an item from a key and its value.
     *
     * @param key the key indexing this entry
     * @param data the value of this entry
     */
    protected TSTItem(String key, Object data) {
        this.data = data;
        this.key = key;
    }
}
/**
* Compares characters by alphabetical order.
* 按字母顺序比较字符
*@param cCompare2 The first char in the comparison. 第一个字符
*@param cRef The second char in the comparison. 第二个字符
*@return A negative number, 0 or a positive number if the second
* char is less, equal or greater.
* 当第二个字符小于第一个字符 返回 负数
* 当第二个字符等于第一个字符 返回 0
* 当第二个字符大于第一个字符 返回 正数
* ASCII码对应值:
* A-Z 65-90
* a-z 97-122
* 其中忽略了a-x之间字符的大小写敏感度,经过下面的处理后其比较字符表从Ascii码转换成
* A a B b C c .... X x Y Z y z
* 65 66 67 68.........111 112 113 114 121 122
* 至于为何YZyz四个字符没有作相同处理暂时还不知其原由
*/
private static int compareCharsAlphabetically( int cCompare2, int cRef) {
int cCompare = 0;
if (cCompare2 > = 65) { //从A开始
if (cCompare2 < 89) { //A-Y之间的字符(不包含Y)
cCompare = ( 2 * cCompare2) - 65;
} else if (cCompare2 < 97) { //在Y-a之间的字符(不包含a)
cCompare = cCompare2 + 24;
} else if (cCompare2 < 121) { //在a-y之间的字符(不包含y)
cCompare = ( 2 * cCompare2) - 128;
} else
cCompare = cCompare2;
} else //A之前的字符(不包含A)
cCompare = cCompare2;
if (cRef < 65) {
return cCompare - cRef;
}
if (cRef < 89) {
return cCompare - (( 2 * cRef) - 65);
}
if (cRef < 97) {
return cCompare - (cRef + 24);
}
if (cRef < 121) {
return cCompare - (( 2 * cRef) - 128);
}
return cCompare - cRef;
}
/** The default number of values returned by the <code>matchAlmost</code> method. */
private int defaultNumReturnValues = - 1; // default number of return values
/** The number of differences allowed in a call to the <code>matchAlmostKey</code> method. */
private int matchAlmostDiff;
/** The base node in the trie. */
private TSTNode rootNode; // root node; has no initializer, so it starts out null
/**
 * Constructs an empty Ternary Search Trie.
 * The body is empty, so all fields keep the values from their declarations.
 */
public TernarySearchTrie() {
}
/**
 * Constructs a Ternary Search Trie and loads data from a <code>File</code> into the Trie.
 * The file is a normal text document, where each line is of the form
 * word : integer.
 *
 * <p>Delegates to the two-argument constructor with <code>compression</code> set to
 * <code>false</code>.
 *
 * @param file The <code>File</code> with the data to load into the Trie.
 * @throws IOException A problem occurred while reading the data.
 */
public TernarySearchTrie(File file) throws IOException {
this(file,false);
}
/**
* 从文件中载入数据到字典树
* 一个普通文本文档每行的格式:word : integer
* Constructs a Ternary Search Trie and loads data from a <code>File</code> into the Trie.
* The file is a normal text document, where each line is of the form " word : integer".
*
*@param file The <code>File</code> with the data to load into the Trie.
*@param compression If true, the file is compressed with the GZIP algorithm, and if false,
* the file is a normal text document.
* true:文件根据GZIP算法压缩
* false:普通的文本文档
*@exception IOException A problem occured while reading the data.
*/
public TernarySearchTrie(File file, boolean compression) throws IOException {
this();
BufferedReader in;
//如果是压缩文件则通过建立解压缩输出流
if(compression) in = new BufferedReader( new InputStreamReader( new GZIPInputStream( new FileInputStream(file))));
else in = new BufferedReader( new InputStreamReader(( new FileInputStream(file))));
String word;
int pos;
int occur;