做一个简单的中文字典树,稍比英文的复杂一些,英文无非26个树杈而且排序方便,用数组表示树杈即可
而中文相对复杂一些,一个中文2个字节一个char就够了,但是树杈个数不定,对树杈进行排序用数组来做也相对复杂,因此考虑用二叉搜索树代替数组来做子节点的存储结构
一个字典树节点链接一个二叉搜索树,二叉树上每个支上又挂载一个字典树节点
部分代码如下:
package com.cyj.analysis;
/**
* 汉字字典树
*
* @author Administrator
*
*/
public class Trie {
// 子节点
private BSTree children;
// 该节点存储的数据
private Character nodeValue;
// 是否词尾 0非词尾 1词尾
private int nodeState = 0;
public Trie(Character value) {
if (value == null)
throw new IllegalArgumentException("参数异常不能为空");
this.nodeValue = value;
}
/**
* 匹配词段
*
* @param charArray
* @return Hit
*/
public Hit match(char[] charArray, int begin, int length, Hit searchHit) {
if (searchHit == null) {
searchHit = new Hit();
searchHit.setBegin(begin);
} else {
searchHit.setUnmatch();
}
searchHit.setEnd(begin);
Character keyChar = new Character(charArray[begin]);
Trie ds = null;
BSTree segmentArray = this.children;
if (segmentArray != null) {
Trie keySegment = new Trie(keyChar);
BSTree bs = null;
if ((bs = segmentArray.findNode(segmentArray, keySegment)) != null)
ds = bs.getValue();
}
if (ds != null) {
if (length > 1) {
// 字符未匹配完
return ds.match(charArray, begin + 1, length - 1, searchHit);
} else if (length == 1) {
if (ds.nodeState == 1) {
searchHit.setMatch();
}
if (ds.children != null) {
searchHit.setPrefix();
searchHit.setMatchedDictSegment(ds);
}
return searchHit;
}
}
return searchHit;
}
/**
* 添加
*/
public synchronized void fillSegment(char[] charArray, int begin, int length) {
Character beginChar = charArray[begin];
Trie trie = new Trie(beginChar);
if (length == 1) {
trie.nodeState = 1;
}
BSTree bs = children;
if (children == null) {
children = new BSTree(trie);
}
BSTree node = children.findNode(children, trie);
if (node == null) {
children.addNode(children, trie);
node = children.findNode(children, trie);
}
if (length > 1) {
node.getValue().fillSegment(charArray, begin + 1, length - 1);
}
}
public BSTree getChildren() {
return children;
}
public Character getNodeValue() {
return nodeValue;
}
public int getNodeState() {
return nodeState;
}
}
package com.cyj.analysis;
public class BSTree {
private BSTree left;
private BSTree right;
private int freqs;
private Trie value;
public BSTree(Trie value){
this.value = value;
this.left = null;
this.right = null;
this.freqs = 1;
}
/**
* 添加节点
* @param src
* @param value
* @return
*/
public BSTree addNode(BSTree src,Trie value){
if(src == null){
src = new BSTree(value);
}else if(src.value.getNodeValue() > value.getNodeValue()){
src.left = addNode(src.left, value);
}else if(src.value.getNodeValue() < value.getNodeValue()){
src.right = addNode(src.right, value);
}else
src.freqs += 1;
balence();
return src;
}
/**
* 查找节点
* @param src
* @param value
* @return
*/
public BSTree findNode(BSTree src,Trie value){
if(src == null){
return null;
}
BSTree node = null;
if(value.getNodeValue() < src.value.getNodeValue()){
node = findNode(src.left,value);
}else if(value.getNodeValue() > src.value.getNodeValue()){
node = findNode(src.right,value);
}else
node = src;
return node;
}
public BSTree getLeft() {
return left;
}
public BSTree getRight() {
return right;
}
public int getFreqs() {
return freqs;
}
public Trie getValue() {
return value;
}
/**
* 平衡方法,暂不考虑
*/
private void balence(){
}
}
测试类
package com.cyj.analysis.test;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import com.cyj.analysis.Hit;
import com.cyj.analysis.Trie;
public class Main {
//测试字典,已通过
public static void main(String[] args) throws IOException {
BufferedReader br = new BufferedReader(new FileReader(new File("C:\\Users\\Administrator\\Desktop\\sougou.txt")));
String content = null;
Trie trie = new Trie('0');
while((content = br.readLine())!=null){
content = new String(content.getBytes("GBK"),"UTF-8");
String words = content.split("[\t ]+")[0];
trie.fillSegment(words.toCharArray(), 0, words.length());
}
br.close();
Hit searchHit = new Hit();
searchHit = trie.match("阿猫".toCharArray(), 0, 2, searchHit);
System.out.println(searchHit.isMatch());
}
}