源码分析ik歧义处理

最新推荐文章于 2024-07-24 20:03:15 发布

thriller_zw

最新推荐文章于 2024-07-24 20:03:15 发布

阅读量118

点赞数

分类专栏：搜索文章标签：数据结构与算法

搜索专栏收录该内容

10 篇文章 0 订阅

订阅专栏

1、例子：
分词串：这是一个中文分词的例子
采用智能分词
有重合元素即为相交
词元链：2--4 相交词元：2 - 3 3-5 3-4 不相交 4-6
2-4和3-5相交，3-5又和4-6相交
词元链LexemePath crossPath对象：
pathBegin : 2
pathEnd : 6
payloadLength : 4
lexeme : 2-4 : 一个 : CN_WORD
lexeme : 2-3 : 一 : TYPE_CNUM
lexeme : 3-5 : 个中 : CN_WORD
lexeme : 3-4 : 个 : COUNT
lexeme : 4-6 : 中文 : CN_WORD
//候选路径集合，第一个2--6作为最优结果
TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
[pathBegin : 2
pathEnd : 6
payloadLength : 4
lexeme : 2-4 : null : CN_WORD
lexeme : 4-6 : null : CN_WORD
, pathBegin : 3
pathEnd : 6
payloadLength : 3
lexeme : 3-4 : null : COUNT
lexeme : 4-6 : null : CN_WORD
, pathBegin : 2
pathEnd : 5
payloadLength : 3
lexeme : 2-3 : null : TYPE_CNUM
lexeme : 3-5 : null : CN_WORD
, pathBegin : 3
pathEnd : 5
payloadLength : 2
lexeme : 3-5 : null : CN_WORD
]
歧义处理结果：
pathBegin : 2
pathEnd : 6
payloadLength : 4
lexeme : 2-4 : 一个 : CN_WORD
lexeme : 4-6 : 中文 : CN_WORD
2、源码分析歧义处理
从头遍历词元链crossPath,生成一条新的不包含相交的词元的链（新的无歧义词元组合），并找到所有相交的Lexeme栈lexemeStack。
遍历lexemeStack把相交的Lexeme作为起始位置，生成新的无歧义词元组合。
把无歧义词元组合加入候选路径集合TreeSet<LexemePath> pathOptions排序，排序规则包括“有效文本长度”等。把第一个作为结果。

//对当前的crossPath进行歧义处理
QuickSortSet.Cell headCell = crossPath.getHead();
LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
TreeSet<LexemePath> pathOptions使用LexemePath的compareTo方法排序，pathOptions.first()作为最优结果。
/**
  * 歧义识别
  * @param lexemeCell 歧义路径链表头
  * @param fullTextLength 歧义路径文本长度
  * @param option 候选结果路径
  * @return
  */
 private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
  //候选路径集合
  TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
  //候选结果路径
  LexemePath option = new LexemePath();
 
  //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈。
  //zw:链中第一个元素为2-4 : null : CN_WORD 所以相交于 2-3 : null : TYPE_CNUM ， 3-5 : null : CN_WORD ， 3-4 : null : COUNT
  Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell , option);
 
  //当前词元链并非最理想的，加入候选路径集合
  pathOptions.add(option.copy());
 
  //存在歧义词，处理
  QuickSortSet.Cell c = null;
  while(!lexemeStack.isEmpty()){
   c = lexemeStack.pop();
   //回滚词元链
   this.backPath(c.getLexeme() , option);
   //从歧义词位置开始，递归，生成可选方案
   this.forwardPath(c , option);
   pathOptions.add(option.copy());
  }
 
  //返回集合中的最优方案
  return pathOptions.first();
 }
 
 /**
  * 向前遍历，添加词元，构造一个无歧义词元组合
  * @param LexemePath path
  * @return
  */
 private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
  //发生冲突的Lexeme栈
  Stack<QuickSortSet.Cell> conflictStack = new Stack<QuickSortSet.Cell>();
  QuickSortSet.Cell c = lexemeCell;
  //迭代遍历Lexeme链表
  while(c != null && c.getLexeme() != null){
   if(!option.addNotCrossLexeme(c.getLexeme())){
    //词元交叉，添加失败则加入lexemeStack栈
    conflictStack.push(c);
   }
   c = c.getNext();
  }
  return conflictStack;
 }
 
 
 /**
  * 回滚词元链，直到它能够接受指定的词元
  * @param lexeme
  * @param l
  */
 private void backPath(Lexeme l , LexemePath option){
  while(option.checkCross(l)){
   option.removeTail();
  }
 
 }
 
 /**
  * 向LexemePath追加不相交的Lexeme
  * @param lexeme
  * @return
  */
 boolean addNotCrossLexeme(Lexeme lexeme){
  if(this.isEmpty()){
   this.addLexeme(lexeme);
   this.pathBegin = lexeme.getBegin();
   this.pathEnd = lexeme.getBegin() + lexeme.getLength();
   this.payloadLength += lexeme.getLength();
   return true;
   
  }else if(this.checkCross(lexeme)){
   return false;
   
  }else{
   this.addLexeme(lexeme);
   this.payloadLength += lexeme.getLength();
   Lexeme head = this.peekFirst();
   this.pathBegin = head.getBegin();
   Lexeme tail = this.peekLast();
   this.pathEnd = tail.getBegin() + tail.getLength();
   return true;
   
  }
 }
     
/**
 * Lexeme链（路径）
 */
class LexemePath extends QuickSortSet implements Comparable<LexemePath>
public int compareTo(LexemePath o) {
  //比较有效文本长度
  if(this.payloadLength > o.payloadLength){
   return -1;
  }else if(this.payloadLength < o.payloadLength){
   return 1;
  }else{
   //比较词元个数，越少越好
   if(this.size() < o.size()){
    return -1;
   }else if (this.size() > o.size()){
    return 1;
   }else{
    //路径跨度越大越好
    if(this.getPathLength() > o.getPathLength()){
     return -1;
    }else if(this.getPathLength() < o.getPathLength()){
     return 1;
    }else {
     //根据统计学结论，逆向切分概率高于正向切分，因此位置越靠后的优先
     if(this.pathEnd > o.pathEnd){
      return -1;
     }else if(pathEnd < o.pathEnd){
      return 1;
     }else{
      //词长越平均越好
      if(this.getXWeight() > o.getXWeight()){
       return -1;
      }else if(this.getXWeight() < o.getXWeight()){
       return 1;
      }else {
       //词元位置权重比较
       if(this.getPWeight() > o.getPWeight()){
        return -1;
       }else if(this.getPWeight() < o.getPWeight()){
        return 1;
       }
       
      }
     }
    }
   }
  }
  return 0;
 }

thriller_zw

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
源码分析ik歧义处理

1、例子：分词串：这是一个中文分词的例子采用智能分词有重合元素即为相交词元链：2--4 相交词元：2 - 3 3-5 3-4 不相交 4-62-4和3-5相交，3-5又和4-6相交词元链LexemePath crossPath对象：pathBegin : 2pathEnd : 6payloadLength : 4lexeme : 2-4 : 一个 : CN_WORDlexeme : 2-3...
复制链接

扫一扫

专栏目录