IK Analyzer 分词学习（一）

最新推荐文章于 2023-02-01 11:47:50 发布

iteye_11910

最新推荐文章于 2023-02-01 11:47:50 发布

阅读量202

点赞数

文章标签： ikanalyze分词

本文链接：https://blog.csdn.net/iteye_11910/article/details/82513921

版权

===================IKAnalyzerTest=================

@Test

public void testIKAnalyzer() throws Exception {

String keyWord = "我们参加中国世锦赛";

System.out.println(keyWord);

IKAnalyzer analyzer = new IKAnalyzer();

// 使用智能分词

analyzer.setUseSmart(true);

// 打印分词结果

System.out.println("当前使用的分词器：" + analyzer.getClass().getSimpleName());

IKTokenizer tokenStream = (IKTokenizer) analyzer.tokenStream("content", new StringReader(keyWord));

CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);

while (tokenStream.incrementToken()){

AnalyzeContext analyze = tokenStream.get_IKImplement().getContext();

LinkedList<Lexeme> xx = analyze.getResults();

for(Lexeme x:xx){

System.out.println("begin:"+x.getBegin()+",length:"+x.getLength());

}

System.out.println("end:"+term.toString());

}

==================IKTokenizer======================

/**
 * Pulls the next lexeme from the IK segmenter and copies its text, length, offsets and type
 * into this tokenizer's attributes.
 *
 * @return {@code true} when a lexeme was emitted; {@code false} when the segmenter returned
 *         {@code null}, meaning there are no more tokens
 * @throws IOException propagated from the segmenter's {@code next()}
 */
@Override
public boolean incrementToken() throws IOException {
    // Clear every attribute value left over from the previous token.
    clearAttributes();

    // Fetch the next lexeme.
    final Lexeme lexeme = _IKImplement.next();
    if (lexeme == null) {
        // Returning false tells the caller that token output is finished.
        return false;
    }

    // Convert the lexeme into attributes.
    final String text = lexeme.getLexemeText();
    System.out.println("截取出来的词元:" + text);

    // Term text and term length.
    termAtt.append(text);
    termAtt.setLength(lexeme.getLength());

    // Term offsets.
    final int begin = lexeme.getBeginPosition();
    final int end = lexeme.getEndPosition();
    offsetAtt.setOffset(begin, end);

    // Remember where segmentation last ended.
    endPosition = end;

    // Term type.
    typeAtt.setType(lexeme.getLexemeTypeString());

    // Returning true tells the caller that another token is available.
    return true;
}

==============IKSegmenter================

private AnalyzeContext context; // segmentation offsets and the like are all stored here

/**

* 分词，获取下一个词元

* @return Lexeme 词元对象

* @throws IOException

public synchronized Lexeme next()throws IOException{

Lexeme l = null;

while((l = context.getNextLexeme()) == null ){

* 从reader中读取数据，填充buffer

* 如果reader是分次读入buffer的，那么buffer要进行移位处理

* 移位处理上次读入的但未处理的数据

int available = context.fillBuffer(this.input);

if(available <= 0){

//reader已经读完

context.reset();

return null;

}else{

//初始化指针

context.initCursor();

do{

System.out.println("当前词:"+context.getSegmentBuff()[context.getCursor()]);

//遍历子分词器

for(ISegmenter segmenter : segmenters){

segmenter.analyze(context);//CJKSegmenter 分词器

}

//字符缓冲区接近读完，需要读入新的字符

if(context.needRefillBuffer()){

break;

}

}while(context.moveCursor()); //向前移动指针

//重置子分词器，为下轮循环进行初始化

for(ISegmenter segmenter : segmenters){

segmenter.reset();

}

//对分词进行歧义处理

this.arbitrator.process(context, this.cfg.useSmart());

//将分词结果输出到结果集，并处理未切分的单个CJK字符

context.outputToResult();

//记录本次分词的缓冲区位移

context.markBufferOffset();

}

return l;

}

=================CJKSegmenter================

public void analyze(AnalyzeContext context) {

if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){

//优先处理tmpHits中的hit

if(!this.tmpHits.isEmpty()){

//处理词段队列

Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);

for(Hit hit : tmpArray){

System.out.println("位置["+hit.getBegin()+","+hit.getEnd()+"],值"+curValue(context,hit));

hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);

if(hit.isMatch()){

//输出当前的词

Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);

context.addLexeme(newLexeme);

if(!hit.isPrefix()){//不是词前缀，hit不需要继续匹配，移除

this.tmpHits.remove(hit);

}

}else if(hit.isUnmatch()){

//hit不是词，移除

this.tmpHits.remove(hit);

}

//*********************************

//再对当前指针位置的字符进行单字匹配

//System.out.println("当前词:"+context.getSegmentBuff()[context.getCursor()]);

Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);

if(singleCharHit.isMatch()){//首字成词

//输出当前的词

Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);

context.addLexeme(newLexeme);

//同时也是词前缀

if(singleCharHit.isPrefix()){

//前缀匹配则放入hit列表

this.tmpHits.add(singleCharHit);

}

}else if(singleCharHit.isPrefix()){//首字为词前缀

//前缀匹配则放入hit列表

this.tmpHits.add(singleCharHit);

}

}else{

//遇到CHAR_USELESS字符

//清空队列

this.tmpHits.clear();

}

//判断缓冲区是否已经读完

if(context.isBufferConsumed()){

//清空队列

this.tmpHits.clear();

}

//判断是否锁定缓冲区

if(this.tmpHits.size() == 0){

context.unlockBuffer(SEGMENTER_NAME);

}else{

context.lockBuffer(SEGMENTER_NAME);

}

iteye_11910

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫