最近测试搜索引擎时发现,只用基础词库分词来建立索引,检索词会被切分得过细,造成检索到的文章只是由七零八落的单字拼凑命中的。经过几天分析,发现在基础词汇分词的基础上再做组合分词,可以极大地提高新词发现率,词语发现准确率和完整词汇发现率。
在基础分词后,组合分词算法如下:
// relaMaxPhraseWords is the largest number of words joined into one phrase.
const relaMaxPhraseWords = 5

// RelaToWordList flattens a word-relation graph into candidate phrases.
//
// Each key of rela is the first word of a phrase. Its next links are followed
// up to relaMaxPhraseWords words deep, and the words along a path are
// concatenated without a separator. A path is appended to the result when the
// cnt of its last node is greater than 1 and no phrase exactly one word longer
// was appended beneath it.
//
// When the first word is "的", "是" or "了" it is replaced by the empty string,
// so the phrase starts at the second word; a two-word phrase whose first word
// is empty is never appended.
//
// Nil nodes are skipped. The result order follows map iteration and is
// therefore unspecified, and no de-duplication is performed.
//
// NOTE(review): suppression only looks one level down. For a chain whose nodes
// all have cnt > 1 the output is the 5-word and the 3-word phrase, not only
// the longest one. This matches the previous behavior; confirm it is intended.
func RelaToWordList(rela map[string]*WordRelaInfo) (relawordls []string) {
	for first, node := range rela {
		if node == nil {
			continue
		}
		switch first {
		case "的", "是", "了":
			first = ""
		}
		// The first word alone is never a phrase, so the flag is ignored.
		relawordls, _ = appendRelaPhrases(relawordls, first, node.next, 2)
	}
	return relawordls
}

// appendRelaPhrases appends to dst the phrases formed by prefix plus each word
// in children, descending first so that longer phrases precede their prefix.
// depth is the number of words in those phrases. It returns the grown slice
// and whether a phrase of exactly depth words was appended.
//
// NOTE(review): assumes WordRelaInfo.next has type map[string]*WordRelaInfo,
// the same as the rela argument of RelaToWordList — confirm against the type.
func appendRelaPhrases(dst []string, prefix string, children map[string]*WordRelaInfo, depth int) ([]string, bool) {
	emitted := false
	for word, node := range children {
		if node == nil {
			continue
		}
		phrase := prefix + word
		longer := false
		if depth < relaMaxPhraseWords {
			dst, longer = appendRelaPhrases(dst, phrase, node.next, depth+1)
		}
		if longer || node.cnt <= 1 {
			continue
		}
		// An empty first word would leave a single bare word, not a phrase.
		if depth == 2 && prefix == "" {
			continue
		}
		dst = append(dst, phrase)
		emitted = true
	}
	return dst, emitted
}