一、概述
1、基本概念
字典树,又称为单词查找树、Trie树,是一种树形结构,是哈希树的一种变种。
2、基本性质
- 根节点不包含字符,除根节点外的每一个子节点都包含一个字符
- 从根节点到某一节点,路径上经过的字符连接起来,就是该节点对应的字符串
- 每个节点的所有子节点包含的字符都不相同
3、应用场景
典型应用是用于统计、排序和保存大量的字符串(但不仅限于字符串),因此经常被搜索引擎系统用于文本词频统计。
1)、字符串的快速查找
给出N个单词组成的熟词表,以及一篇全用小写英文书写的文章,请你按最早出现的顺序写出所有不在熟词表中的生词。在这道题中,我们可以用数组枚举、用哈希,也可以用字典树。若用字典树,先把熟词建成一棵树,然后读入文章,逐个单词在树中查找,这种方法效率是比较高的。
2)、字典树在“串”排序方面的应用
给定N个互不相同的仅由一个单词构成的英文名,让你将它们按字典序从小到大输出。用字典树进行排序:采用数组的方式创建字典树,这棵树的每个节点的所有儿子很显然地按照其字母大小排序,对这棵树进行先序遍历即可。
3)、字典树在最长公共前缀问题的应用
对所有串建立字典树,对于两个串的最长公共前缀的长度即它们所在的节点的公共祖先个数,于是问题就转化为最近公共祖先(LCA)问题。
4、优点
利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。
实例: 统计含有相同前缀的单词个数的实现
以属性prefixCnt来统计,经过该节点的单词一共有多少个,这样,在给定一个前缀的时候,按照查询字符串的方法,一步步往下查找,途中出现未找到的字符,则返回0。否则到达包含前缀的最后一个字符的节点,直接返回该节点的prefixCnt的值即可。然而该值是在进行插入单词时更新的,目前的处理只能针对没有重复插入同一个单词的情况。
代码1:
#include <cstring>
#include <iostream>
#include <queue>
using namespace std;
// Number of child slots per node: one for each lowercase letter 'a'..'z'
// (children are indexed with `ch - 'a'` throughout this file).
// NOTE(review): with `using namespace std;` in effect, an unqualified `size`
// can be ambiguous with std::size when compiling as C++17 or later; the struct
// below therefore refers to it as `::size`. Consider renaming the constant.
const int size = 26;

// One node of the trie.
//
// The struct has no destructor: nodes are released explicitly by the
// functions in this file that call `delete` (MakeEmpty and Remove).
struct TrieTreeNode
{
    char val;                    // character stored in this node ('\0' for the root built in main)
    bool isEnd;                  // true when an inserted word terminates at this node
    int childCnt;                // number of non-NULL entries in child[]
    int prefixCnt;               // bumped by Insert each time an inserted word passes through this node
    TrieTreeNode *child[::size]; // child[c - 'a'] is the subtree for letter c, or NULL when absent

    // Builds a childless, non-terminal node carrying `_val`.
    // `child()` value-initialises the whole array, so every slot starts out
    // as a null pointer without going through memset.
    TrieTreeNode(char _val)
        : val(_val), isEnd(false), childCnt(0), prefixCnt(0), child()
    {
    }
};
// Inserts `word` into the trie rooted at `root`.
//
// Parameters:
//   root - root of the trie; when it is NULL a new root node ('\0') is
//          allocated and stored back through the reference.
//   word - NUL-terminated string consisting only of 'a'..'z'.
//
// Behaviour:
//   - A NULL `word`, or a word containing any character outside 'a'..'z',
//     is rejected and the trie is left untouched (such a character would
//     otherwise index outside child[]).
//   - Inserting a word that is already stored is a no-op, so prefixCnt
//     counts each distinct word once instead of once per insertion.
//   - The empty string marks the root itself as a word end.
void Insert(TrieTreeNode *&root, const char *word)
{
    if (word == NULL)
        return;

    // Read-only pass: validate every character and, at the same time, follow
    // the existing path (if any) to find out whether the word is already stored.
    const TrieTreeNode *probe = root;
    for (const char *c = word; *c != '\0'; ++c)
    {
        if (*c < 'a' || *c > 'z')
            return;
        if (probe != NULL)
            probe = probe->child[*c - 'a'];
    }
    if (probe != NULL && probe->isEnd)
        return; // duplicate: do not inflate prefixCnt along the path

    if (root == NULL)
        root = new TrieTreeNode('\0');

    // Mutating pass: create missing nodes and count the word on every node
    // of its path (the root itself is not counted).
    TrieTreeNode *p = root;
    for (const char *c = word; *c != '\0'; ++c)
    {
        TrieTreeNode *&slot = p->child[*c - 'a'];
        if (slot == NULL)
        {
            slot = new TrieTreeNode(*c);
            p->childCnt++;
        }
        slot->prefixCnt++;
        p = slot;
    }
    p->isEnd = true;
}
// Reports whether `word` is stored in the trie rooted at `root`.
//
// Returns true only when every character of `word` can be followed from the
// root and the node reached is marked as a word end. Returns false for a NULL
// `root`, a NULL `word`, or a word containing a character outside 'a'..'z'
// (which cannot be stored and would otherwise index outside child[]).
bool Find(TrieTreeNode *root, const char *word)
{
    if (root == NULL || word == NULL)
        return false;

    const TrieTreeNode *p = root;
    // Walk until the terminator instead of calling strlen on every iteration.
    for (const char *c = word; *c != '\0'; ++c)
    {
        if (*c < 'a' || *c > 'z')
            return false;
        p = p->child[*c - 'a'];
        if (p == NULL)
            return false;
    }
    return p->isEnd;
}
// Prints the trie breadth-first on a single line.
// Each node is written as "<val>(<childCnt>) "; a newline is written after
// the last node. Nothing at all is printed when `root` is NULL.
void LevelOrderTraverse(TrieTreeNode *root)
{
    if (!root)
        return;

    std::queue<TrieTreeNode *> pending;
    for (pending.push(root); !pending.empty();)
    {
        TrieTreeNode *node = pending.front();
        pending.pop();

        std::cout << node->val << "(" << node->childCnt << ") ";

        // Enqueue the existing children in slot order, i.e. 'a' first.
        for (int slot = 0; slot != size; ++slot)
        {
            if (TrieTreeNode *next = node->child[slot])
                pending.push(next);
        }
    }
    std::cout << "\n";
}
// Prints the subtree under `treeNode` depth-first, node before children.
// Each node is written as "<val>(<childCnt>) "; a NULL subtree prints nothing.
void PreOrderTraverse(TrieTreeNode *treeNode)
{
    if (treeNode == NULL)
        return;

    std::cout << treeNode->val << "(" << treeNode->childCnt << ") ";

    // Children are visited in slot order; empty slots return immediately.
    int slot = 0;
    while (slot < size)
    {
        PreOrderTraverse(treeNode->child[slot]);
        ++slot;
    }
}
// Prints the subtree under `treeNode` depth-first, children before the node.
// Each node is written as "<val>(<childCnt>) "; a NULL subtree prints nothing.
void PostOrderTraverse(TrieTreeNode *treeNode)
{
    if (treeNode == NULL)
        return;

    // Children are visited in slot order; empty slots return immediately.
    int slot = 0;
    while (slot < size)
    {
        PostOrderTraverse(treeNode->child[slot]);
        ++slot;
    }

    std::cout << treeNode->val << "(" << treeNode->childCnt << ") ";
}
// Deletes the whole subtree rooted at `treeNode` and resets the caller's
// pointer to NULL. Calling it with a NULL pointer leaves it NULL.
void MakeEmpty(TrieTreeNode *&treeNode)
{
    if (treeNode == NULL)
        return;

    // Release the descendants first; each recursive call also nulls the
    // corresponding child slot through the reference.
    int slot = 0;
    while (slot < size)
    {
        MakeEmpty(treeNode->child[slot]);
        ++slot;
    }

    delete treeNode;
    treeNode = NULL;
}
// Inserts the first `n` entries of `words` into the trie, in array order,
// by calling Insert once per entry. Does nothing when `n` is not positive.
void BuildTrieTree(TrieTreeNode *&root,const char *words[], int n)
{
    const char **cursor = words;
    for (int remaining = n; remaining > 0; --remaining)
    {
        Insert(root, *cursor);
        ++cursor;
    }
}
bool Remove(TrieTreeNode *&treeNode, const char *word,int pos, int n)
{
if (pos == n)
{
treeNode->isEnd = false;//set the node not to be an end
//if the last node contains the last char is a leaf,return true to delete it
return treeNode->childCnt == 0;
}
//not found, not delete this node
if (treeNode->child[word[pos]-'a'] == NULL)
return false;
//if true, the child is a leaf, delete the child
if ( Remove( treeNode->child[word[pos]-'a'], word, pos+1, n))
{
delete treeNode->child[word[pos]-'a'];
treeNode->child[word[pos]-'a'] = NULL;
treeNode->prefixCnt--;
//if the node becomes a leaf and is not an end,return true to delete it
if (--treeNode->childCnt == 0 && treeNode->isEnd == false)
return true;
}
//other not delete
return false;
}
// Counts the stored words that start with `prefix`.
//
// Returns 0 when `root` or `prefix` is NULL, when `prefix` contains a
// character outside 'a'..'z', or when no stored word has that prefix.
// Otherwise returns the prefixCnt of the node reached by the last character
// of `prefix`.
//
// The empty prefix matches every word. Insert never increments the root's own
// prefixCnt, so that case is answered by adding up the counters of the root's
// children, plus one if the root itself is marked as a word end.
int CountWordsWithPrefix(TrieTreeNode *root, const char *prefix)
{
    if (root == NULL || prefix == NULL)
        return 0;

    if (*prefix == '\0')
    {
        int total = root->isEnd ? 1 : 0;
        for (int i = 0; i < size; i++)
        {
            if (root->child[i] != NULL)
                total += root->child[i]->prefixCnt;
        }
        return total;
    }

    const TrieTreeNode *p = root;
    // Walk until the terminator instead of calling strlen on every iteration.
    for (const char *c = prefix; *c != '\0'; ++c)
    {
        if (*c < 'a' || *c > 'z')
            return 0;
        p = p->child[*c - 'a'];
        if (p == NULL)
            return 0;
    }
    return p->prefixCnt;
}
// Demo driver: builds a small trie, prints it in three traversal orders,
// then exercises Find, Insert, Remove and CountWordsWithPrefix.
int main()
{
    TrieTreeNode *root = new TrieTreeNode('\0');

    // Alternative word list for exercising insertion:
    //   {"b","abc","abd","bcd","abcd","efg","hii"}
    // Word list used to exercise removal:
    const char *words[] = {"abc","ad","ef"};
    // sizeof(words) is the byte size of the pointer array, so dividing by the
    // size of one element yields the number of words.
    BuildTrieTree(root, words, sizeof(words) / sizeof(words[0]));

    LevelOrderTraverse(root);
    PreOrderTraverse(root);
    std::cout << '\n';
    PostOrderTraverse(root);
    std::cout << "\n";

    std::cout << (Find(root, "ef") ? "ef found" : "ef not found") << std::endl;

    // After this insertion node 'e' is a word end but not a leaf, so it
    // cannot be deleted until its remaining child has been removed.
    Insert(root, "e");
    LevelOrderTraverse(root);

    Remove(root, "ef", 0, strlen("ef"));
    LevelOrderTraverse(root);

    Remove(root, "e", 0, strlen("e"));
    LevelOrderTraverse(root);

    // Number of words starting with "a", before and after removing "ad".
    std::cout << CountWordsWithPrefix(root, "a") << std::endl;
    Remove(root, "ad", 0, strlen("ad"));
    std::cout << CountWordsWithPrefix(root, "a") << std::endl;

    MakeEmpty(root);
    return 0;
}