觉得 Go 标准库里的 strings.Replacer 功能强大,特别好用.对它的查找和优先级是怎么处理的有点兴趣,花时间研究了一下源码,在这里记录一下个人理解.
package main
//author:xcl
//2014-1-20 记录
import (
"fmt"
"strings"
)
// main feeds the same old/new pairs to strings.NewReplacer and to the local
// NewReplacer, then prints the input string and the library's replacement
// result.
func main() {
	pairs := []string{
		"y", "25",
		"中", "国",
		"中工", "家伙",
	}
	/*
		patterns := make([]string,270 * 2)
		for i :=0;i< 270 *2;i++{
			patterns[i] = fmt.Sprintf("%d",i)
		}
	*/

	const input = "中(国)--中工(家伙)"
	output := strings.NewReplacer(pairs...).Replace(input)

	// Build the local trie copy; this prints its debug output and a few
	// sample lookups before the summary lines below.
	NewReplacer(pairs...)

	fmt.Println("\nmain() replacer.Replace old=", input)
	fmt.Println("main() replacer.Replace new=", output)
}
// NewReplacer builds the local generic replacer from oldnew and prints the
// result of three fixed sample lookups. Unlike strings.NewReplacer it
// returns nothing.
func NewReplacer(oldnew ...string) {
	r := makeGenericReplacer(oldnew)

	samples := []struct {
		lead       string // text printed in front of the "NewReplacer()" label
		key        string // string handed to lookup
		ignoreRoot bool
	}{
		{"\n", "中", true},
		{"", "中工", true},
		{"", "y", false},
	}
	for _, s := range samples {
		val, keylen, found := r.lookup(s.key, s.ignoreRoot)
		fmt.Println(s.lead+"NewReplacer() "+s.key+" val:", val, " keylen:", keylen, " found:", found)
	}
	/*
		val,keylen,found := r.lookup("2",true)
		fmt.Println("\nNewReplacer() 2 val:",val," keylen:",keylen," found:",found)
		val,keylen,found = r.lookup("3",true)
		fmt.Println("\nNewReplacer() 3 val:",val," keylen:",keylen," found:",found)
	*/
}
// genericReplacer holds the lookup trie built from the old/new pairs,
// together with the byte-to-index mapping used to address every node's table.
type genericReplacer struct {
	// root is the root node of the trie.
	root trieNode

	// tableSize is the size of a trie node's lookup table. It is the number
	// of unique key bytes.
	tableSize int

	// mapping maps from key bytes to a dense index for trieNode.table.
	// makeGenericReplacer sets the entry of every byte that occurs in no key
	// to tableSize.
	mapping [256]byte
}
// makeGenericReplacer builds the lookup trie for the given old/new pairs
// (oldnew[0] is replaced by oldnew[1], oldnew[2] by oldnew[3], and so on),
// printing debug output along the way.
//
// Each pair is added with priority len(oldnew) minus the index of its old
// string, so pairs listed earlier get a larger priority. oldnew must have an
// even length; with an odd length the function panics when it indexes the
// missing new string.
func makeGenericReplacer(oldnew []string) *genericReplacer {
	r := new(genericReplacer)

	// Pass 1: flag every byte value that occurs in some old string.
	for i := 0; i < len(oldnew); i += 2 { // step 2: even slots hold the old strings
		key := oldnew[i]
		fmt.Println("\nmakeGenericReplacer() for key=", key)
		// Walk the key byte by byte, not rune by rune: a multi-byte UTF-8
		// character such as a Chinese one contributes each of its bytes
		// separately (for example 228 gives r.mapping[228] = 1).
		for j, b := range []byte(key) {
			r.mapping[b] = 1
			fmt.Println("makeGenericReplacer() key[", j, "]=", b)
		}
	}

	// Every flagged slot holds 1, so the sum is the number of distinct bytes.
	for _, flag := range r.mapping {
		r.tableSize += int(flag)
	}
	fmt.Println("makeGenericReplacer() r.tableSize=", r.tableSize)

	// Pass 2: turn the flags into dense indexes, handed out in ascending
	// byte order. Bytes that occur in no key are mapped to tableSize.
	var next byte
	for i := range r.mapping {
		if r.mapping[i] == 0 {
			r.mapping[i] = byte(r.tableSize)
			continue
		}
		r.mapping[i] = next
		fmt.Println("makeGenericReplacer() r.mapping[", i, "] =", r.mapping[i])
		next++
	}

	// Ensure root node uses a lookup table (for performance).
	r.root.table = make([]*trieNode, r.tableSize)

	// Insert every pair into the trie. The priority is len(oldnew)-i, so the
	// earlier a pair appears in the argument list, the higher it ranks.
	for i := 0; i < len(oldnew); i += 2 {
		old, repl := oldnew[i], oldnew[i+1]
		r.root.add(old, repl, len(oldnew)-i, r)
	}
	return r
}
// trieNode is a node of the lookup trie. A node continues either through
// prefix/next (a run of bytes leading to a single child) or through table
// (one child slot per distinct key byte); add never leaves both set.
type trieNode struct {
	// value is the replacement string of the key that ends at this node.
	// It is only meaningful when priority is non-zero.
	value string
	// priority is the priority of the key that ends at this node. Zero means
	// no key ends here; among matching keys lookup prefers the larger value.
	priority int

	// prefix holds the bytes that must match before moving on to next.
	prefix string
	// next is the node reached after prefix has been matched.
	next *trieNode

	// table holds the child nodes, indexed through genericReplacer.mapping
	// by the next byte of the key.
	table []*trieNode
}
// add inserts key with replacement val and the given priority into the
// subtrie rooted at t, printing debug output for every node it visits.
// r supplies the table size and the byte-to-index mapping for any lookup
// table that has to be created or indexed.
func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
	fmt.Println("trieNode->add() val=", val, " key=", key)

	// The whole key has been consumed: this node is where the key ends.
	if key == "" {
		// A priority of 0 marks a node that has no value yet. A node that
		// already carries a value keeps it, so the first add of a key wins.
		if t.priority == 0 {
			t.value = val
			t.priority = priority
			fmt.Println("trieNode->add() t.priority==", priority)
		}
		return
	}

	switch {
	case t.prefix != "":
		// The node already has a prefix, which may have to be split among
		// multiple nodes. Measure the longest common prefix of t.prefix
		// and key first.
		common := 0
		for common < len(t.prefix) && common < len(key) && t.prefix[common] == key[common] {
			common++
		}

		switch common {
		case len(t.prefix):
			// key starts with the whole prefix: continue below it.
			t.next.add(key[common:], val, priority, r)

		case 0:
			// First byte differs, start a new lookup table here. Looking
			// up what is currently t.prefix[0] will lead to oldBranch, and
			// looking up key[0] will lead to newBranch.
			oldBranch := t.next
			if len(t.prefix) > 1 {
				// More than one prefix byte: the bytes after the first
				// one move into a node of their own.
				oldBranch = &trieNode{
					prefix: t.prefix[1:],
					next:   t.next,
				}
			}
			newBranch := new(trieNode)

			// lookup tests node.table != nil to recognise table nodes.
			t.table = make([]*trieNode, r.tableSize)
			t.table[r.mapping[t.prefix[0]]] = oldBranch
			t.table[r.mapping[key[0]]] = newBranch
			t.prefix = ""
			t.next = nil
			newBranch.add(key[1:], val, priority, r)

		default:
			// Insert a new node after the common section of the prefix.
			tail := &trieNode{
				prefix: t.prefix[common:],
				next:   t.next,
			}
			t.prefix = t.prefix[:common]
			t.next = tail
			tail.add(key[common:], val, priority, r)
		}

	case t.table != nil:
		// Insert into the existing table, creating the child on first use.
		slot := r.mapping[key[0]]
		child := t.table[slot]
		if child == nil {
			child = new(trieNode)
			t.table[slot] = child
		}
		child.add(key[1:], val, priority, r)

	default:
		// Fresh node: keep the rest of the key as the prefix and store the
		// value in a new node behind it.
		leaf := new(trieNode)
		t.prefix = key
		t.next = leaf
		leaf.add("", val, priority, r)
	}
}
// lookup walks the trie along the bytes of s and reports the value of the
// highest-priority key found on that path.
//
// val is that key's replacement, keylen the number of bytes of s the key
// covers, and found tells whether any key matched. When ignoreRoot is true a
// value stored on the root node itself is not considered.
func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
	best := 0     // highest priority seen so far
	consumed := 0 // bytes of s matched on the way to cur

	for cur := &r.root; cur != nil; {
		atRoot := cur == &r.root
		if cur.priority > best && (!ignoreRoot || !atRoot) {
			best = cur.priority
			val, keylen, found = cur.value, consumed, true
		}
		if s == "" {
			break
		}

		switch {
		case cur.table != nil:
			slot := r.mapping[s[0]]
			if int(slot) == r.tableSize {
				// The byte occurs in no key at all: stop searching.
				return val, keylen, found
			}
			cur = cur.table[slot]
			s = s[1:]
			consumed++

		case cur.prefix != "" && HasPrefix(s, cur.prefix):
			// Nodes below a table keep the remaining key bytes in prefix,
			// so match the whole prefix and carry on behind it.
			consumed += len(cur.prefix)
			s = s[len(cur.prefix):]
			cur = cur.next

		default:
			return val, keylen, found
		}
	}
	return val, keylen, found
}
// HasPrefix reports whether the string s begins with prefix. An empty prefix
// matches every string.
func HasPrefix(s, prefix string) bool {
	if len(prefix) > len(s) {
		return false
	}
	return s[:len(prefix)] == prefix
}
记录:
ascii范围内的只占一个字节,如y(121)
utf8中每个汉字占三个字节.如中(228,184,173)
构建树:
如果是新的第一个单词或词组
先进 } else if t.table != nil {
然后再进 else,这中间会把 t.prefix = key,把key值存放在prefix,将""传给下一个node
最后执行 if key == "" { 里面的 if t.priority == 0 { ,将 t.value = val, t.priority = priority
即从key的字符编码(第一个字节)对应的root.table位置开始,依次指向另外的字符编码node,中间node的prefix存下key余下的字节(key[1:]).
最末一个node,存下对应的val及priority.
如果是后传入的单词或词组,先从key字符编码首个字节对应的root.table位置开始,依次查找,
} else if t.table != nil {
如果已有前缀的,进行比较 if t.prefix != "" {
1. 如key以目前的prefix整个开头(n == len(t.prefix)),则把key[n:]交给t.next,继续构建树子节点
2. 如prefix与key的第一个字节就不同(n == 0),则另起炉灶,在当前节点建table,构建一条新的分支
prefixNode 承上,keyNode 启下
至于为什么t.table = make([]*trieNode, r.tableSize),是为了预留映射空间.
所以它是这么弄的,而不是t.table[0],t.table[1].
t.table[r.mapping[t.prefix[0]]] = prefixNode
t.table[r.mapping[key[0]]] = keyNode
3. 有部分相同: 把prefix在第n个字节处拆开,t.prefix[n:]放到新建的节点里,然后从key[n:]开始继续构建树子节点
priority:
在这的定义是数字越大,优先级别越高
if key == "" { //字符编码中间的字节
if t.priority == 0 { //如果有定义过priority的就略过,新加的,把现有的级别加上
//对应{中,中工}这种,虽然后面有"中工",但"中"的priority要高,所以"中工"对应的值虽然能找到,但不会返回.
if node.priority > bestPriority { bestPriority = node.priority}
例如:中工(priority=4),中(priority=2)
patterns:
"中工","家伙",
"中","国",
则:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: �� (这里的prefix是"中"的后两个字节184,173,单独输出不是合法的UTF-8字符,所以显示为乱码)
lookup() bestPriority: 0 node.priority: 2 value: 国 prefix: 工
NewReplacer() 中 val: 国 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ��
lookup() bestPriority: 0 node.priority: 2 value: 国 prefix: 工
lookup() bestPriority: 2 node.priority: 4 value: 家伙 prefix:
NewReplacer() 中工 val: 家伙 keylen: 6 found: true
main() replacer.Replace old= 中(国)--中工(家伙)
main() replacer.Replace new= 国(国)--家伙(家伙)
如果调整下顺序,把中->国提前,则会发现,下面的结果:
patterns:
"中","国",
"中工","家伙",
则:
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ��
lookup() bestPriority: 0 node.priority: 4 value: 国 prefix: 工
NewReplacer() 中 val: 国 keylen: 3 found: true
lookup() bestPriority: 0 node.priority: 0 value: prefix:
lookup() bestPriority: 0 node.priority: 0 value: prefix: ��
lookup() bestPriority: 0 node.priority: 4 value: 国 prefix: 工
lookup() bestPriority: 4 node.priority: 2 value: 家伙 prefix:
NewReplacer() 中工 val: 国 keylen: 3 found: true
main() replacer.Replace old= 中(国)--中工(家伙)
main() replacer.Replace new= 国(国)--国工(家伙)
还有,刚发现 lookup(s string, ignoreRoot bool) (val string, keylen int,found bool) {}中
定义在返回值中的变量,原来可以直接在函数中使用,
至于返回,直接return就行了,都不用写全返回值的,好省事.
MAIL: xcl_168@aliyun.com
BLOG:http://blog.csdn.net/xcl168