源码分析ik分词主流程

最新推荐文章于 2024-04-21 20:01:31 发布

thriller_zw

最新推荐文章于 2024-04-21 20:01:31 发布

阅读量182

点赞数

分类专栏：搜索文章标签：数据结构与算法

搜索专栏收录该内容

10 篇文章 0 订阅

订阅专栏

1、首先内存中建立词典树。

包括：主词典树、停止词词典树、量词词典树

数据结构：树（或说字典树），子节点<=3时，数组存储DictSegment[] childrenArray; >3时迁移到hashMap Map<Character , DictSegment> childrenMap;

根：DictSegment _MainDict = new DictSegment((char)0);

字典树中的每个节点用DictSegmenter表示，每个节点的下一级节点分支使用Array或者Map来存储

主词典的子节点（也是子节点最多的节点）

segmentMapHashMap<K,V> (id=76)

entrySetHashMap$EntrySet (id=81)

keySetnull

loadFactor0.8

modCount5860

size5860

tableHashMap$Entry<K,V>[8192] (id=85)

threshold6553

valuesnull

/**
 * 词典管理类,单例模式
 */
public class Dictionary {
	/*
	 * 词典单子实例
	 */
	private static Dictionary singleton;
	
	/*
	 * 主词典对象
	 */
	private DictSegment _MainDict;
	
	/*
	 * 停止词词典 
	 */
	private DictSegment _StopWordDict;
	/*
	 * 量词词典
	 */
	private DictSegment _QuantifierDict;
	
	/**
	 * 配置对象
	 */
	private Configuration cfg;
	
	。。。
	}

2、遍历子分词器分词

对每个字符分别让三个分词器处理，如果符合字符类型等条件，将匹配到的词元加入结果集。

依次遍历三个分词器：LetterSegmenter(字母分词)，CN_QuantifierSegmenter(数、量词分词)，CJKSegmenter(中文分词)

CN_QuantifierSegmenter分出中文数字”一“，量词”个“

LetterSegmenter：连续的英文字母作为一个词元,如：can

第一次调用l = context.getNextLexeme()) == null，会分词，取词时候不为空。

/**
	 * 分词，获取下一个词元
	 * @return Lexeme 词元对象
	 * @throws IOException
	 */
	public synchronized Lexeme next()throws IOException{
		Lexeme l = null;
		while((l = context.getNextLexeme()) == null ){
			/*
			 * 从reader中读取数据，填充buffer
			 * 如果reader是分次读入buffer的，那么buffer要  进行移位处理
			 * 移位处理上次读入的但未处理的数据
			 */
			int available = context.fillBuffer(this.input);
			if(available <= 0){
				//reader已经读完
				context.reset();
				return null;
				
			}else{
				//初始化指针
				context.initCursor();
				do{
        			//遍历子分词器
        			for(ISegmenter segmenter : segmenters){
        				segmenter.analyze(context);
        			}
        			//字符缓冲区接近读完，需要读入新的字符
        			if(context.needRefillBuffer()){
        				break;
        			}
   				//向前移动指针
				}while(context.moveCursor());
				//重置子分词器，为下轮循环进行初始化
				for(ISegmenter segmenter : segmenters){
					segmenter.reset();
				}
			}
			//对分词进行歧义处理
			this.arbitrator.process(context, this.cfg.useSmart());			
			//将分词结果输出到结果集，并处理未切分的单个CJK字符
			context.outputToResult();
			//记录本次分词的缓冲区位移
			context.markBufferOffset();			
		}
		return l;
	}

3、中文分词器analyz函数。

从主词典树查到第一个字符节点DictSegment，如果是完全匹配（单字词）加入结果集，如果是词的前缀，加入临时命中集tmpHits，下一个字符优先从tmpHits中匹配。

如果完全匹配（节点没有叶子），把词（路径）加入结果集，如果不匹配，从tmpHits移除。tmpHits匹配完后，还得去主词典再查是否为单字词或下一个词的前缀。

public void analyze(AnalyzeContext context) {
		
		if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ //zw:当前字符类型不是 无类型
			
			//优先处理tmpHits中的hit
			if(!this.tmpHits.isEmpty()){
				//处理词段队列
				Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
				for(Hit hit : tmpArray){
					hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
					if(hit.isMatch()){
						//输出当前的词
						Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
						context.addLexeme(newLexeme);
						
						if(!hit.isPrefix()){//不是词前缀，hit不需要继续匹配，移除
							this.tmpHits.remove(hit);
						}
						
					}else if(hit.isUnmatch()){
						//hit不是词，移除
						this.tmpHits.remove(hit);
					}					
				}
			}			
			
			//*********************************
			//再对当前指针位置的字符进行单字匹配
			Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); //zw:主词典匹配singleton._MainDict
			if(singleCharHit.isMatch()){//首字成词
				//输出当前的词
				Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
				context.addLexeme(newLexeme);

				//同时也是词前缀
				if(singleCharHit.isPrefix()){
					//前缀匹配则放入hit列表
					this.tmpHits.add(singleCharHit);
				}
			}else if(singleCharHit.isPrefix()){//首字为词前缀
				//前缀匹配则放入hit列表
				this.tmpHits.add(singleCharHit);
			}
			

		}else{
			//遇到CHAR_USELESS字符
			//清空队列
			this.tmpHits.clear();
		}
		
		//判断缓冲区是否已经读完
		if(context.isBufferConsumed()){
			//清空队列
			this.tmpHits.clear();
		}
		
		//判断是否锁定缓冲区
		if(this.tmpHits.size() == 0){
			context.unlockBuffer(SEGMENTER_NAME);
			
		}else{
			context.lockBuffer(SEGMENTER_NAME);
		}
	}
	/**
 * 表示一次词典匹配的命中
 */
public class Hit {
	//Hit不匹配
	private static final int UNMATCH = 0x00000000;
	//Hit完全匹配
	private static final int MATCH = 0x00000001;
	//Hit前缀匹配
	private static final int PREFIX = 0x00000010;
	
	
	//该HIT当前状态，默认未匹配
	private int hitState = UNMATCH;
	
	//记录词典匹配过程中，当前匹配到的词典分支节点
	private DictSegment matchedDictSegment; 
	/*
	 * 词段开始位置
	 */
	private int begin;
	/*
	 * 词段的结束位置
	 */
	private int end;
	
	。。。
	}
	

**
 * IK词元对象 
 */
public class Lexeme implements Comparable<Lexeme>{
	//lexemeType常量
	//未知
	public static final int TYPE_UNKNOWN = 0;
	//英文
	public static final int TYPE_ENGLISH = 1;
	//数字
	public static final int TYPE_ARABIC = 2;
	//英文数字混合
	public static final int TYPE_LETTER = 3;
	//中文词元
	public static final int TYPE_CNWORD = 4;
	//中文单字
	public static final int TYPE_CNCHAR = 64;
	//日韩文字
	public static final int TYPE_OTHER_CJK = 8;
	//中文数词
	public static final int TYPE_CNUM = 16;
	//中文量词
	public static final int TYPE_COUNT = 32;
	//中文数量词
	public static final int TYPE_CQUAN = 48;
	
	//词元的起始位移
	private int offset;
       //词元的相对起始位置
       private int begin;
       //词元的长度
       private int length;
       //词元文本
       private String lexemeText;
       //词元类型
       private int lexemeType;
       。。。
   }

4、匹配到后向链表集合添加词元

/**
	 * 向链表集合添加词元
	 * @param lexeme
	 */
	boolean addLexeme(Lexeme lexeme){
		Cell newCell = new Cell(lexeme); 
		if(this.size == 0){
			this.head = newCell;
			this.tail = newCell;
			this.size++;
			return true;
			
		}else{
			if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同，不放入集合
				return false;
				
			}else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部
				this.tail.next = newCell;
				newCell.prev = this.tail;
				this.tail = newCell;
				this.size++;
				return true;
				
			}else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
				this.head.prev = newCell;
				newCell.next = this.head;
				this.head = newCell;
				this.size++;
				return true;
				
			}else{					
				//从尾部上逆
				Cell index = this.tail;
				while(index != null && index.compareTo(newCell) > 0){
					index = index.prev;
				}
				if(index.compareTo(newCell) == 0){//词元与集合中的词元重复，不放入集合
					return false;
					
				}else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置
					newCell.prev = index;
					newCell.next = index.next;
					index.next.prev = newCell;
					index.next = newCell;
					this.size++;
					return true;					
				}
			}
		}
		return false;
	}


//分词结果集数据结构	
/**
 * IK分词器专用的Lexem快速排序集合
 */
class QuickSortSet {
	//链表头
	private Cell head;
	//链表尾
	private Cell tail;
	//链表的实际大小
	private int size;
	//内部类，QuickSortSet集合单元
	class Cell implements Comparable<Cell>{
		private Cell prev;
		private Cell next;
		private Lexeme lexeme;
		。。。
	 }
		。。。
	}

5、取词

过滤停止词，设置词元文本。

/**
	 * 返回lexeme 
	 * 
	 * 同时处理合并
	 * @return
	 */
	Lexeme getNextLexeme(){
		//从结果集取出，并移除第一个Lexme
		Lexeme result = this.results.pollFirst();
		while(result != null){
    		//数量词合并
    		this.compound(result);
    		if(Dictionary.getSingleton().isStopWord(this.segmentBuff ,  result.getBegin() , result.getLength())){
       			//是停止词继续取列表的下一个
    			result = this.results.pollFirst(); 				
    		}else{
	 			//不是停止词, 生成lexeme的词元文本,输出
	    		result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
	    		break;
    		}
		}
		return result;
	}

6、例子

对"这是一个中文分词的例子，IKAnalyer can analysis english text too"分词结果如下：

0 - 2 : 这是 | CN_WORD //0表示词的开始位置，2表示结束位置，这是表示词的内容，CN_WORD表示中文

2 - 4 : 一个 | CN_WORD

2 - 3 : 一 | TYPE_CNUM //TYPE_CNUM 表示中文数词

3 - 5 : 个中 | CN_WORD

3 - 4 : 个 | COUNT //COUNT 表示中文量词

4 - 6 : 中文 | CN_WORD

6 - 8 : 分词 | CN_WORD

9 - 11 : 例子 | CN_WORD

12 - 21 : ikanalyer | ENGLISH

22 - 25 : can | ENGLISH