lucene IKAnalyzer中文分词器学习（2）

最新推荐文章于 2021-02-21 02:36:58 发布

weixin_34208283

最新推荐文章于 2021-02-21 02:36:58 发布

阅读量95

点赞数

文章标签： python 数据结构与算法

原文链接：https://my.oschina.net/zhuqianli/blog/1583693

版权

2019独角兽企业重金招聘Python工程师标准>>>

// Demo: run the IK analyzer over a sample sentence and print each token's offset attribute.
// The boolean flag is passed straight to the IKAnalyzer5x constructor
// (presumably it selects "smart" segmentation - confirm against IKAnalyzer5x).
Analyzer analyzer = new IKAnalyzer5x(true);
TokenStream tokenStream = analyzer.tokenStream("test", "一个新款韩版长袖羊驼绒羊毛皮草外套女海宁皮草");
OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
try {
    // Lucene consumer workflow: reset() -> incrementToken() until false -> end() -> close().
    tokenStream.reset();
    while (tokenStream.incrementToken()) { // see how IKTokenizer5x implements incrementToken()
        System.out.println(offsetAttribute.toString());
    }
    // Signal end-of-stream so final offset state is recorded.
    tokenStream.end();
} finally {
    // Always release the stream so the analyzer can hand it out again.
    tokenStream.close();
}

/**
 * Advances the stream by one token, pulling the next lexeme from the IK implementation
 * and copying its text, offsets and type into the token attributes.
 *
 * @return {@code true} if a lexeme was produced, {@code false} when none is left
 * @throws IOException propagated from the IK implementation's {@code next()}
 */
public boolean incrementToken() throws IOException {
    clearAttributes();
    // Fetch the next lexeme; see the next() method of the IK implementation.
    final Lexeme lexeme = _IKImplement.next();
    if (lexeme == null) {
        return false;
    }
    // Term text and its length
    termAtt.append(lexeme.getLexemeText());
    termAtt.setLength(lexeme.getLength());
    // Start/end offsets; the end position is also remembered on the tokenizer
    offsetAtt.setOffset(lexeme.getBeginPosition(), lexeme.getEndPosition());
    endPosition = lexeme.getEndPosition();
    // Lexeme type as a string
    typeAtt.setType(lexeme.getLexemeTypeString());
    return true;
}

/**
 * Returns the next lexeme. Whenever the context has no pending lexeme, the buffer is
 * refilled from the reader, every sub-segmenter is run over it, ambiguities are resolved
 * and the results are pushed to the context's result set before trying again.
 *
 * @return the next lexeme, or {@code null} once the reader yields no more data
 * @throws IOException declared for the buffer refill from the reader
 */
public synchronized Lexeme next() throws IOException {
  Lexeme l = null;
  while ((l = context.getNextLexeme()) == null) {
    /*
     * Read data from the reader to fill the buffer. If the reader is consumed in several
     * passes, the buffer must be shifted so that data read earlier but not yet processed is kept.
     */
    int available = context.fillBuffer(this.input);
    if (available <= 0) {
      // The reader has been fully consumed
      context.reset();
      return null;

    } else {
      // Initialize the cursor
      context.initCursor();
      do {
        // Run every sub-segmenter on the current position (the sub-segmenters are the
        // main subject of this article)
        for (ISegmenter segmenter : segmenters) {
          segmenter.analyze(context);
        }
        // The character buffer is almost consumed; new characters must be read in
        if (context.needRefillBuffer()) {
          break;
        }
        // Move the cursor forward
      } while (context.moveCursor());
      // Reset the sub-segmenters to prepare them for the next round
      for (ISegmenter segmenter : segmenters) {
        segmenter.reset();
      }
    }
    // Resolve ambiguities among the candidate lexemes
    this.arbitrator.process(context, this.cfg.useSmart());
    // Push the lexemes to the result set and handle single CJK characters left unsegmented
    context.outputToResult();
    // Record the buffer offset reached by this round
    context.markBufferOffset();
  }
  return l;
}

/**
 * Creates the sub-segmenter implementations used during analysis.
 *
 * @return a new list holding, in order, a LetterSegmenter, a CN_QuantifierSegmenter
 *         and a CJKSegmenter
 */
private List<ISegmenter> loadSegmenters() {
  final List<ISegmenter> loaded = new ArrayList<ISegmenter>(4);
  // Sub-segmenter for letters
  loaded.add(new LetterSegmenter());
  // Sub-segmenter for Chinese numerals and quantifiers (the simpler one, studied first)
  loaded.add(new CN_QuantifierSegmenter());
  // Sub-segmenter for Chinese (CJK) words
  loaded.add(new CJKSegmenter());
  return loaded;
}

先来看子分词器的analyze方法

/**
 * Processes the current cursor position for Chinese numerals and quantifiers, then locks
 * the buffer while a numeral or quantifier is still in progress and unlocks it otherwise.
 *
 * @param context the analysis context being scanned
 */
public void analyze(AnalyzeContext context) {
  // Numerals first, then quantifiers
  processCNumber(context);
  processCount(context);

  // Work is pending when a numeral span is open or quantifier hits are still queued
  final boolean pending = this.nStart != -1 || this.nEnd != -1 || !countHits.isEmpty();
  if (pending) {
    context.lockBuffer(SEGMENTER_NAME);
  } else {
    // Nothing in progress: release this segmenter's lock on the buffer
    context.unlockBuffer(SEGMENTER_NAME);
  }
}

处理中文数词

// Characters treated as Chinese numerals
private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";// Cnum
// Lookup set holding every character of Chn_Num
private static Set<Character> ChnNumberChars = new HashSet<Character>();
static {
  // Populate the set one character at a time
  for (int i = 0; i < Chn_Num.length(); i++) {
    ChnNumberChars.add(Chn_Num.charAt(i));
  }
}

/**
 * Tracks runs of Chinese numeral characters. A run is opened on the first numeral,
 * extended while numerals continue, and emitted through outputNumLexeme when a
 * non-numeral character is met or the buffer has been consumed.
 *
 * @param context the analysis context positioned at the current character
 */
private void processCNumber(AnalyzeContext context) {
  // No numeral run is open yet
  final boolean idle = nStart == -1 && nEnd == -1;
  // Current character is a Chinese character listed in the numeral set
  final boolean onNumeral = CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
      && ChnNumberChars.contains(context.getCurrentChar());

  if (onNumeral) {
    if (idle) {
      // Open a new run at the cursor
      nStart = context.getCursor();
    }
    // The run (new or existing) now ends at the cursor
    nEnd = context.getCursor();
  } else if (!idle) {
    // The run just ended: emit it and reset both markers
    this.outputNumLexeme(context);
    nStart = -1;
    nEnd = -1;
  }

  // Buffer exhausted while a run is still open: emit it now
  if (context.isBufferConsumed() && nStart != -1 && nEnd != -1) {
    outputNumLexeme(context);
    nStart = -1;
    nEnd = -1;
  }
}

// 处理中文量词

/**
 * Handles Chinese quantifiers (measure words). The article shows only the entry guard here.
 * @param context the analysis context
 */
private void processCount(AnalyzeContext context) {
  // Check whether a quantifier scan needs to be started
  if (!this.needCountScan(context)) {
    return;
  }

  ......
}

/**
 * Decides whether quantifier scanning should run at the current position.
 *
 * @param context the analysis context
 * @return {@code true} when a numeral run or quantifier match is already in progress, or when
 *         the last original lexeme is a Chinese/Arabic number ending right at the cursor
 */
private boolean needCountScan(AnalyzeContext context) {
  // Already in the middle of a numeral or a quantifier
  final boolean numeralOpen = nStart != -1 && nEnd != -1;
  if (numeralOpen || !countHits.isEmpty()) {
    return true;
  }

  // Otherwise a quantifier is only considered right after a number lexeme
  // (numerals are processed before quantifiers, so the number is already recorded)
  if (context.getOrgLexemes().isEmpty()) {
    return false;
  }
  final Lexeme last = context.getOrgLexemes().peekLast();
  final boolean isNumber = Lexeme.TYPE_CNUM == last.getLexemeType()
      || Lexeme.TYPE_ARABIC == last.getLexemeType();
  // The number must be adjacent to the cursor
  return isNumber && last.getBegin() + last.getLength() == context.getCursor();
}

/**
 * Handles Chinese quantifiers (measure words). Parts of the body are elided by the article.
 * @param context the analysis context
 */
private void processCount(AnalyzeContext context) {
  // Check whether a quantifier scan needs to be started
  if (!this.needCountScan(context)) {
    return;
  }

  if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {

    // Hits already queued in countHits are processed first
    ......

    // *********************************
    // Match the single character at the current cursor position
    Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(),
      context.getCursor(), 1); // see this method
    .......

  } else {
    // The input is not a Chinese character
    // Discard quantifier candidates that have not completed
    this.countHits.clear();
  }

  // The buffer has been fully read while quantifier candidates are still pending
  if (context.isBufferConsumed()) {
    // Discard quantifier candidates that have not completed
    this.countHits.clear();
  }
}

Dictionary类

/**
 * Looks up a character span in the quantifier dictionary.
 *
 * @param charArray characters to search in
 * @param begin     index of the first character to match
 * @param length    number of characters to match
 * @return a Hit describing the match result
 */
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
  // Delegate to the root segment of the singleton's quantifier dictionary
  final DictSegment quantifierRoot = singleton._QuantifierDict;
  return quantifierRoot.match(charArray, begin, length);
}

匹配前先看看量词词典是怎么储存字符的

/**
 * Stores the configuration, then loads the main, stop-word and quantifier dictionaries
 * in that order.
 *
 * @param cfg configuration kept in the {@code cfg} field before the dictionaries are loaded
 */
private Dictionary(Configuration cfg) {
  this.cfg = cfg;
  loadMainDict();
  loadStopWordDict();
  // Loads the quantifier (measure word) dictionary
  loadQuantifierDict();
}

/**
 * 加载量词词典
 */
private void loadQuantifierDict() {
  // 建立一个量词典实例
  _QuantifierDict = new DictSegment((char) 0);
  // 读取量词词典文件
  InputStream is = this.getClass().getClassLoader()
      .getResourceAsStream(cfg.getQuantifierDicionary());
  if (is == null) {
    throw new RuntimeException("Quantifier Dictionary not found!!!");
  }
  try {
    BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
    String theWord = null;
    do {
      theWord = br.readLine();
      if (theWord != null && !"".equals(theWord.trim())) {
        // 读到一行数据 写入量词词典  查看fillSegment方法
        _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
      }
    } while (theWord != null);

  } catch (IOException ioe) {
    System.err.println("Quantifier Dictionary loading exception.");
    ioe.printStackTrace();

  } finally {
    try {
      if (is != null) {
        is.close();
        is = null;
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}

DictSegment类

/**
 * Adds a whole word to the dictionary tree below this segment.
 *
 * @param charArray the characters of the word to add
 */
void fillSegment(char[] charArray) {
  // Start at index 0, cover the full word, and pass 1 as the "enabled" state
  final int wordLength = charArray.length;
  fillSegment(charArray, 0, wordLength, 1);
}

/**
 * Adds (or, with {@code enabled == 0}, masks) the word {@code charArray[begin .. begin+length)}
 * in the dictionary tree below this segment, one character per tree level.
 *
 * @param charArray characters of the word
 * @param begin     index of the character handled at this level
 * @param length    number of characters still to be stored, including the one at {@code begin}
 * @param enabled   1 marks the path as a complete word; 0 masks the word. The value is also
 *                  passed to lookforSegment as its "create" flag, so child nodes are only
 *                  created when it is 1
 */
private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
  // Nothing to store; this also prevents indexing into an empty array
  if (length <= 0) {
    return;
  }

  // Look up the canonical Character instance in the shared character table
  // (Character.valueOf replaces the deprecated Character(char) constructor)
  Character beginChar = Character.valueOf(charArray[begin]);
  Character keyChar = charMap.get(beginChar);
  // Character not in the table yet: register it
  if (keyChar == null) {
    charMap.put(beginChar, beginChar);
    keyChar = beginChar;
  }

  // Find the child segment for keyChar among this node's children; created if missing
  // and enabled == 1
  DictSegment ds = lookforSegment(keyChar, enabled);
  if (ds != null) {
    if (length > 1) {
      // More characters remain: continue one level down
      ds.fillSegment(charArray, begin + 1, length - 1, enabled);
    } else if (length == 1) {
      // Last character of the word: record the state on this child.
      // enabled=1 marks a complete word, enabled=0 masks the word in the dictionary
      ds.nodeState = enabled;
    }
  }

}

/**
 * Finds the child segment stored under {@code keyChar}. Children are kept in a sorted array
 * while their count is at most ARRAY_LENGTH_LIMIT and in a map beyond that.
 *
 * @param keyChar character identifying the child segment
 * @param create  when 1, a missing child is created and stored; any other value leaves a miss
 *                as {@code null}
 * @return the child segment, or {@code null} if it does not exist and was not created
 */
private DictSegment lookforSegment(Character keyChar, int create) {

  DictSegment ds = null;

  if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
    // Get the array container, creating it if it does not exist yet
    DictSegment[] segmentArray = getChildrenArray();
    // Search the array
    DictSegment keySegment = new DictSegment(keyChar);
    int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
    if (position >= 0) {
      ds = segmentArray[position];
    }

    // No matching segment was found in the array
    if (ds == null && create == 1) {
      ds = keySegment;
      if (this.storeSize < ARRAY_LENGTH_LIMIT) {
        // The array still has room: store the segment in the array
        segmentArray[this.storeSize] = ds;
        // One more segment stored
        this.storeSize++;
        // Keep the used part of the array sorted for binarySearch
        Arrays.sort(segmentArray, 0, this.storeSize);

      } else {
        // The array is full: switch to map storage
        // Get the map container, creating it if it does not exist yet
        Map<Character, DictSegment> segmentMap = getChildrenMap();
        // Move the segments from the array into the map
        migrate(segmentArray, segmentMap);
        // Store the new segment
        segmentMap.put(keyChar, ds);
        // One more segment stored. storeSize++ must run before the array is released so that,
        // in extreme cases, an empty array is never read
        this.storeSize++;
        // Release the array reference
        this.childrenArray = null;
      }

    }

  } else {
    // Get the map container, creating it if it does not exist yet
    Map<Character, DictSegment> segmentMap = getChildrenMap();
    // Search the map
    ds = (DictSegment) segmentMap.get(keyChar);
    if (ds == null && create == 1) {
      // Build a new segment
      ds = new DictSegment(keyChar);
      segmentMap.put(keyChar, ds);
      // One more segment stored on this node
      this.storeSize++;
    }
  }

  return ds;
}

加载后的_QuantifierDict

// Shared character table. Article note: it holds every character of the quantifiers, with
// key and value being the same object, which de-duplicates the Character instances.
private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16,
    0.95f);
// Upper bound on the number of children kept in the array.
// Once a node needs more than ARRAY_LENGTH_LIMIT children, the segments in childrenArray are
// moved into childrenMap.
// Article note: for _QuantifierDict, storeSize is certainly greater than 3, so childrenArray
// has been set to null.
private static final int ARRAY_LENGTH_LIMIT = 3; 

// Map storage. Article note: the key is a quantifier character and the value is the
// DictSegment whose nodeChar is that character.
private Map<Character, DictSegment> childrenMap;
// Array storage
private DictSegment[] childrenArray; // article note: null for the root _QuantifierDict

// Character stored on this node
private Character nodeChar; // article note: '0' for the root (it is constructed with (char) 0)
// Number of segments stored on this node.
// storeSize <= ARRAY_LENGTH_LIMIT uses array storage; storeSize > ARRAY_LENGTH_LIMIT uses map storage
private int storeSize = 0; // article note: for the root this is childrenMap's size, which the author describes as the number of entries in the quantifier dictionary
// State of this DictSegment; defaults to 0. 1 means the path from the root to this node is a word
private int nodeState = 0; // article note: seems to carry no meaning for the root _QuantifierDict

childrenMap中一个字符对应的一个DictSegment

// Shared character table. Article note (as observed by the author, to be verified): when the
// quantifier is a single character, charMap does not hold the character.
private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16,
    0.95f);
// Upper bound on the number of children kept in the array
private static final int ARRAY_LENGTH_LIMIT = 3;

// Map storage
private Map<Character, DictSegment> childrenMap;
// Array storage
private DictSegment[] childrenArray; // article note: for a two-character quantifier, the second character's DictSegment is stored here

// Character stored on this node
private Character nodeChar;
// Number of segments stored on this node.
// storeSize <= ARRAY_LENGTH_LIMIT uses array storage; storeSize > ARRAY_LENGTH_LIMIT uses map storage
private int storeSize = 0; // article note: 0 for a one-character quantifier, 1 for a two-character quantifier
// State of this DictSegment; defaults to 0. 1 means the path from the root to this node is a word
private int nodeState = 0; // article note: 1 for a one-character quantifier; 0 on the first character's DictSegment of a two-character quantifier

比如一个量词叫“xyz”

x的DictSegment保存在_QuantifierDict的childrenMap里

y的DictSegment保存在x的DictSegment中的childrenArray里

z的DictSegment保存在y的DictSegment中的childrenArray

了解量词数据是以什么样的数据结构保存了之后再去看match方法

/**
 * Looks up a character span in the quantifier dictionary.
 *
 * @param charArray characters to search in
 * @param begin     index of the first character to match
 * @param length    number of characters to match
 * @return a Hit describing the match result
 */
public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
  // Delegate to the root segment of the singleton's quantifier dictionary
  final DictSegment quantifierRoot = singleton._QuantifierDict;
  return quantifierRoot.match(charArray, begin, length);
}

/**
 * Matches a character span against this segment without an existing Hit.
 *
 * @param charArray characters to search in
 * @param begin     index of the first character to match
 * @param length    number of characters to match
 * @return the Hit produced by the four-argument overload
 */
Hit match(char[] charArray, int begin, int length) {
  // No previous hit to continue from
  final Hit noPreviousHit = null;
  return match(charArray, begin, length, noPreviousHit);
}

// 这边先就看看量词的单字符匹配逻辑

Hit match(char[] charArray, int begin, int length, Hit searchHit) {

  if (searchHit == null) {
    // 如果hit为空，新建   
    searchHit = new Hit(); (1)  
    // 设置hit的其实文本位置  
    searchHit.setBegin(begin); (2)
  } else {
    // 否则要将HIT状态重置
    searchHit.setUnmatch();
  }
  // 设置hit的当前处理位置
  searchHit.setEnd(begin); (3)

  Character keyChar = new Character(charArray[begin]);
  DictSegment ds = null;

  // 引用实例变量为本地变量，避免查询时遇到更新的同步问题
  DictSegment[] segmentArray = this.childrenArray;
  Map<Character, DictSegment> segmentMap = this.childrenMap;

  // STEP1 在节点中查找keyChar对应的DictSegment
  if (segmentArray != null) {
    // 在数组中查找
    DictSegment keySegment = new DictSegment(keyChar);
    int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
    if (position >= 0) {
      ds = segmentArray[position];
    }

  } else if (segmentMap != null) {
    // 在map中查找
    ds = (DictSegment) segmentMap.get(keyChar); (4)
  }

  // STEP2 找到DictSegment，判断词的匹配状态，是否继续递归，还是返回结果
  if (ds != null) { (5)
    if (length > 1) {
      // 词未匹配完，继续往下搜索
      return ds.match(charArray, begin + 1, length - 1, searchHit);
    } else if (length == 1) { (6)      
      // 搜索最后一个char
      if (ds.nodeState == 1) { (7)
        // 添加HIT状态为完全匹配
        searchHit.setMatch();
      }
      if (ds.hasNextNode()) {
        // 添加HIT状态为前缀匹配
        searchHit.setPrefix();
        // 记录当前位置的DictSegment
        searchHit.setMatchedDictSegment(ds);
      }
      return searchHit; (8)
    }

  }
  // STEP3 没有找到DictSegment， 将HIT设置为不匹配
  return searchHit;
}

/**
 * Describes the outcome of one dictionary lookup (abridged in the article: fields only).
 */
public class Hit {
  /** Hit state: no match. */
  private static final int UNMATCH = 0x00;
  /** Hit state: full match. */
  private static final int MATCH = 0x01;
  /** Hit state: prefix match. */
  private static final int PREFIX = 0x10;

  /** Current state of this hit; starts out as unmatched. */
  private int hitState = UNMATCH;

  /** Dictionary branch node reached so far during matching. */
  private DictSegment matchedDictSegment;

  /** Start position of the text span. */
  private int begin;

  /** End position of the text span. */
  private int end;
}

/**
 * 处理中文量词
 * @param context
 */
private void processCount(AnalyzeContext context) {
  // 判断是否需要启动量词扫描
  if (!this.needCountScan(context)) {
    return;
  }

  if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {

    // 优先处理countHits中的hit
    ......

    // *********************************
    // 对当前指针位置的字符进行单字匹配
    Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(),
      context.getCursor(), 1);
    if (singleCharHit.isMatch()) {// 首字成量词词
      // 输出当前的词
      Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1,
          Lexeme.TYPE_COUNT);
      context.addLexeme(newLexeme);// 匹配上了就在context的orgLexemes中添加这个词元

      // 同时也是词前缀
      if (singleCharHit.isPrefix()) {
        // 前缀匹配则放入hit列表
        this.countHits.add(singleCharHit);
      }
    } else if (singleCharHit.isPrefix()) {// 首字为量词前缀
      // 前缀匹配则放入hit列表
      this.countHits.add(singleCharHit);
    }

  } else {
    // 输入的不是中文字符
    // 清空未成形的量词
    this.countHits.clear();
  }

  // 缓冲区数据已经读完，还有尚未输出的量词
  if (context.isBufferConsumed()) {
    // 清空未成形的量词
    this.countHits.clear();
  }
}

// Article example: the text to match in the context is "abcdef" and the dictionary
// contains the quantifier "ab".
// When the cursor pointed at 'a', the hit for 'a' was added to countHits.
// The cursor now points at 'b', so countHits is not empty.
// Hits already queued in countHits are processed first
if (!this.countHits.isEmpty()) {
  // Iterate over a snapshot array so that hits can be removed from countHits inside the loop
  Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
  for (Hit hit : tmpArray) {
    // Article example: check whether the childrenArray of a's dictionary node contains a node
    // for 'b'; if b's nodeState is 1 the word is fully matched and hit.isMatch() returns true
    hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(),
      context.getCursor(), hit);
    if (hit.isMatch()) {
      // Emit the current word
      Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(),
          context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT);
      context.addLexeme(newLexeme); // article note: added to the context's orgLexemes

      // Article example: if the dictionary also held a quantifier "abc", remove would not be
      // called here, because b's node has nodeState=1 ("ab" is a quantifier) and
      // storeSize > 0 ("ab" is also the start of a longer quantifier)
      if (!hit.isPrefix()) {// not a prefix: the hit needs no further matching, remove it
        this.countHits.remove(hit);
      }

    } else if (hit.isUnmatch()) { // article example: a's node has no child for 'b', remove this hit
      // The hit is not a word: remove it
      this.countHits.remove(hit);
    }
  }
}

回头再去看CJKSegmenter（处理中文词的子分词器），它的匹配过程基本和量词匹配差不多，只是词库不同而已

转载于:https://my.oschina.net/zhuqianli/blog/1583693