首先我们应该都了解,lucene 组件是文档搜索的基本组件,今天我们先来了解一下 lucene 组件分词器的一些知识,然后我们自定义一个简单的分词器。
首先我们明白几个主要的类:
Analyzer:为我们获取分词器,在此类中我们要复写它的抽象方法,以获得我们特殊的分词器,通过复写 createComponents 方法获取分词器对象
TokenStream : 分词器对象,其 incrementToken 方法判断是否还有下一个分词项
TokenFilter : 分词过滤器
Tokenizer:分词处理器
Attribute:分词属性对象
在我们分词过程中,这几个类很重要,明白其中原理那么我们自定义分词器就轻而易举地实现。查看源码可以很清晰知道其原理,在这里不再赘述,下面自定义一个按空格分词并转小写的分词器:
package com.cai.analizer; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.AttributeReflector; import java.io.IOException; public class CaiAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source =new MyWhiteSpaceTokenizer(); TokenStream filter = new MyWhileSpaceTokenFiler(source); return new TokenStreamComponents(source,filter); } //创建分词器:一个单词一个单词去读,有空格进行分词处理 static class MyWhiteSpaceTokenizer extends Tokenizer{ MyCharAttribute mychar = this.addAttribute(MyCharAttribute.class); //声明存储容器 char[] buffer = new char[255]; //声明获取元素 int c ; //声明容器长度 int length = 0; @Override public boolean incrementToken() throws IOException{ clearAttributes();//清楚所有熟悉ing length = 0; //循环获取字符 while (true){ c = this.input.read(); if(c == -1) {//读取完毕 if(length > 0){//判断是否为最后一个字符,如果不是则循环加入 this.mychar.setChars(buffer,length); return true; }else{ return false; } } if(Character.isWhitespace(c)){//判断是否为空白字符 if(length > 0){//说明不是第一个 this.mychar.setChars(buffer,length); return true; } } buffer[length++] = (char)c; } } } //创建分词过滤器 /** * */ static class MyWhileSpaceTokenFiler extends TokenFilter { public MyWhileSpaceTokenFiler(TokenStream source){ super(source); } MyCharAttribute myCharAttribute = this.addAttribute((MyCharAttribute.class)); @Override public boolean incrementToken() throws IOException{ boolean res = this.input.incrementToken(); if(res){//说明有分词 char[] chars = myCharAttribute.getChars(); int length = myCharAttribute.getLength(); if(length > 0){ for(int i=0;i<length;i++){ chars[i]= Character.toLowerCase(chars[i]); } } } return res; } } //定义属性对象 public interface MyCharAttribute extends Attribute{ void setChars(char[] buffer, int length);//设置字符 char[] getChars();//获取字符串 int 
getLength();//获取长度 String getString();//获取字符 } public static class MyCharAttributeImpl extends AttributeImpl implements MyCharAttribute{ //设置长度 private char[] charTerm = new char[255]; private int length =0; public MyCharAttributeImpl(){} @Override public void clear() { } @Override public void reflectWith(AttributeReflector reflector) { } @Override public void copyTo(AttributeImpl target) { } @Override public void setChars(char[] buffer, int length) { this.length=length; if(length > 0){ System.arraycopy(buffer,0,this.charTerm,0,buffer.length);//将字符拷贝到属性中 } } @Override public char[] getChars() { return this.charTerm; } @Override public int getLength() { return this.length; } @Override public String getString() { if(this.length > 0){ return new String(this.charTerm,0,length); } return null; } } //测试分词器 public static void main(String[] args){ String text = "I Love You , Baby ! How are you "; Analyzer analyzer = new CaiAnalyzer(); try { TokenStream tokenStream= analyzer.tokenStream("aa",text); MyCharAttribute myCharAttribute = tokenStream.addAttribute(CaiAnalyzer.MyCharAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()){ System.out.print(myCharAttribute.getString()+"|"); } tokenStream.end(); }catch (IOException e){ e.printStackTrace(); } } }