Trie Tree,又名字典树,是一种树形结构。常用于统计、排序和保存大量的字符串,比如说,在自然语言处理中,常用来统计词频。优点是查询效率高。
Trie Tree 具有以下三个性质:
1. 根节点不包含字符,除根节点以外每个节点只包含一个字符;
2. 从根节点到某一个节点,路径上经过的字符连接起来,为该节点对应的字符串;
3. 每个节点的所有子节点包含的字符各不相同。
- Trie Tree 的基本实现
package com.feng.nlp.changedalgorithm;
import lombok.Data;
import org.apache.commons.lang3.StringUtils;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
/**
 * A trie (prefix tree) that stores words case-insensitively and counts how often each word was
 * inserted.
 * <p>
 * Every word or prefix passed in is lower-cased with {@link Locale#ROOT} before it is stored or
 * looked up, so all words reported by this class are in lower case. Each node keeps its children
 * in a hash map keyed by character, so a word may contain any {@code char} (digits, letters,
 * punctuation, spaces, CJK, ...).
 * <p>
 * Created by lionel on 17/4/15.
 */
public class TrieTree {

    /** Placeholder character stored in the root node; the root never contributes to a word. */
    private static final char ROOT_CHARACTER = '~';

    private final TrieNode root;

    /** Creates an empty trie. */
    public TrieTree() {
        root = new TrieNode(ROOT_CHARACTER);
    }

    /**
     * Inserts a word into the trie, incrementing its occurrence count if it is already present.
     * A {@code null}, empty or whitespace-only word is ignored.
     *
     * @param word the word to insert
     */
    public void insert(String word) {
        if (isBlank(word)) {
            return;
        }
        String normalized = normalize(word);
        TrieNode node = root;
        for (int i = 0; i < normalized.length(); i++) {
            char ch = normalized.charAt(i);
            TrieNode child = node.children.get(ch);
            if (child == null) {
                child = new TrieNode(ch);
                node.children.put(ch, child);
            }
            // One more inserted word runs through (or ends at) this node.
            child.prefixNum++;
            node = child;
        }
        // A non-blank word has at least one character, so node is never the root here.
        node.wordEnd = true;
        node.num++;
    }

    /**
     * Walks the whole trie and collects every stored word with its occurrence count.
     *
     * @return a new map from each stored (lower-cased) word to the number of times it was inserted
     */
    public HashMap<String, Integer> getAllWords() {
        HashMap<String, Integer> words = new HashMap<String, Integer>();
        collectWords(root, new StringBuilder(), words);
        return words;
    }

    /**
     * Checks whether the given string was inserted as a complete word. A string that is only a
     * prefix of stored words does not count.
     *
     * @param word the word to look up; compared case-insensitively
     * @return {@code true} if the word was inserted at least once; {@code false} otherwise,
     *         including for a {@code null}, empty or whitespace-only argument
     */
    public boolean isExist(String word) {
        if (isBlank(word)) {
            return false;
        }
        TrieNode node = findNode(normalize(word));
        return node != null && node.wordEnd;
    }

    /**
     * Returns every stored word that starts with the given prefix, including the prefix itself
     * when it is a stored word. This is the lookup behind an input-method style auto-completion.
     * (The method name keeps its historical spelling for compatibility with existing callers.)
     *
     * @param prefix the prefix to search for; compared case-insensitively
     * @return a new map from each matching (lower-cased) word to its occurrence count; empty,
     *         never {@code null}, when the prefix is blank or no stored word starts with it
     */
    public Map<String, Integer> getWordsFroPrefix(String prefix) {
        Map<String, Integer> words = new HashMap<String, Integer>();
        if (isBlank(prefix)) {
            return words;
        }
        String normalized = normalize(prefix);
        TrieNode node = findNode(normalized);
        if (node != null) {
            // Seed the path with the normalized prefix so results match the stored lower-case form.
            collectWords(node, new StringBuilder(normalized), words);
        }
        return words;
    }

    /** Lower-cases the input independently of the JVM's default locale. */
    private static String normalize(String text) {
        return text.toLowerCase(Locale.ROOT);
    }

    /** Returns true when the string is null, empty, or consists only of whitespace characters. */
    private static boolean isBlank(String text) {
        if (text == null) {
            return true;
        }
        for (int i = 0; i < text.length(); i++) {
            if (!Character.isWhitespace(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Follows the already-normalized key from the root; returns null if the path does not exist. */
    private TrieNode findNode(String key) {
        TrieNode node = root;
        for (int i = 0; i < key.length() && node != null; i++) {
            node = node.children.get(key.charAt(i));
        }
        return node;
    }

    /**
     * Depth-first walk below {@code node} that records every complete word into {@code out}.
     * {@code path} holds the characters leading to {@code node} and is restored before returning.
     */
    private static void collectWords(TrieNode node, StringBuilder path, Map<String, Integer> out) {
        if (node.wordEnd) {
            out.put(path.toString(), node.num);
        }
        for (TrieNode child : node.children.values()) {
            path.append(child.character);
            collectWords(child, path, out);
            path.setLength(path.length() - 1);
        }
    }

    /** A single trie node; static so it holds no reference to the enclosing tree. */
    private static final class TrieNode {
        /** Character stored in this node. */
        private final char character;
        /** Number of times the word ending at this node was inserted. */
        private int num;
        /**
         * Number of insertions whose path passes through or ends at this node. Maintained by
         * insert; no query in this class reads it.
         */
        private int prefixNum;
        /** True when at least one inserted word ends at this node (it may still have children). */
        private boolean wordEnd;
        /** Child nodes keyed by their character. */
        private final Map<Character, TrieNode> children = new HashMap<Character, TrieNode>();

        TrieNode(char character) {
            this.character = character;
        }
    }
}