Lucene 5 之后的版本有了较大的改动,这里把基于 Lucene 5 改造同义词分词器的方式和代码记录一下。
功能类加测试类一共 6 个,下面逐一介绍:
1 同义词分词器类SameWordAnalyzer
2 同义词过滤器类SameWordFilter
3 根据词语获取同义词引擎接口SameWordEngine
4 同义词引擎接口实现类SameWordEngineImpl
5 分词器分词分析工具类 AnalyzerUtils
6 结果测试类 TestUnit
使用的jar包如下
具体代码如下
1 SameWordAnalyzer类内容:
package com.liu.lucene.pro;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
/**
 * Analyzer that splits text with a {@link StandardTokenizer} and then runs the
 * tokens through a {@link SameWordFilter} driven by the supplied engine.
 */
public class SameWordAnalyzer extends Analyzer {

    /** Synonym lookup handed to every filter this analyzer builds. */
    private final SameWordEngine engine;

    /**
     * Creates the analyzer.
     *
     * @param engine synonym source passed on to each {@link SameWordFilter}
     */
    public SameWordAnalyzer(SameWordEngine engine) {
        this.engine = engine;
    }

    /**
     * Builds the tokenizer/filter chain. The same chain is used for every
     * field; {@code fieldName} is not consulted.
     *
     * @param fieldName name of the field being analyzed (unused)
     * @return components whose source is the tokenizer and whose sink is the
     *         synonym filter wrapped around it
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new StandardTokenizer();
        return new TokenStreamComponents(tokenizer, new SameWordFilter(tokenizer, engine));
    }
}
2 SameWordFilter类
package com.liu.lucene.pro;
import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Token filter that emits, after each token for which the engine knows
 * synonyms, one extra token per synonym at the same position (position
 * increment 0).
 *
 * <p>The filter keeps per-stream state (pending synonyms and the captured
 * attribute state of the token they belong to), so that state is cleared in
 * {@link #reset()} to keep a reused stream from replaying leftovers.
 */
public class SameWordFilter extends TokenFilter {

    /** Synonym lookup consulted for every token coming from the input. */
    private final SameWordEngine engine;

    /** Synonyms still to be emitted for the most recently read token. */
    private final Stack<String> samewordStack;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

    /** Attribute state of the token whose synonyms are waiting on the stack. */
    private AttributeSource.State current;

    /**
     * @param input  upstream token stream
     * @param engine synonym source; {@code getSameWords} may return
     *               {@code null} for words without synonyms
     */
    protected SameWordFilter(TokenStream input, SameWordEngine engine) {
        super(input);
        this.engine = engine;
        this.samewordStack = new Stack<String>();
    }

    /**
     * Emits a pending synonym if there is one, otherwise advances the input
     * and queues the synonyms of the new token.
     *
     * <p>Declared {@code final} because Lucene requires concrete TokenStream
     * classes (or their {@code incrementToken()}) to be final and asserts this
     * when assertions are enabled.
     *
     * @return {@code true} if a token was produced, {@code false} at end of input
     * @throws IOException if the upstream stream fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!samewordStack.isEmpty()) {
            String sameWord = samewordStack.pop();
            // Start from the original token's attributes, then swap in the
            // synonym text.
            restoreState(current);
            termAtt.setEmpty();
            termAtt.append(sameWord);
            // Increment 0 stacks the synonym on the original token's position.
            posIncrAtt.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (isAddSameWord()) {
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the upstream stream and drops any synonyms or captured state
     * left over from a previous, possibly partial, consumption.
     *
     * @throws IOException if the upstream stream fails to reset
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        samewordStack.clear();
        current = null;
    }

    /**
     * Looks up synonyms for the current term and queues them for emission.
     *
     * @return {@code true} if at least one synonym was queued
     */
    private boolean isAddSameWord() {
        String[] sameWords = engine.getSameWords(termAtt.toString());
        if (sameWords == null || sameWords.length == 0) {
            return false;
        }
        for (String sameWord : sameWords) {
            samewordStack.push(sameWord);
        }
        return true;
    }
}
3 SameWordEngine引擎接口
package com.liu.lucene.pro;
/**
 * Source of synonyms for a single word.
 */
public interface SameWordEngine {

    /**
     * Returns the synonyms known for the given word.
     *
     * @param str the word to look up
     * @return the synonyms of {@code str}; implementations may return
     *         {@code null} when the word has none
     */
    String[] getSameWords(String str);
}
4 SameWordEngineImpl引擎接口实现类
package com.liu.lucene.pro;
import java.util.HashMap;
import java.util.Map;
/**
 * {@link SameWordEngine} backed by a small hard-coded synonym table.
 */
public class SameWordEngineImpl implements SameWordEngine {

    /**
     * Word-to-synonyms table. Built once when the class is loaded rather than
     * on every lookup, because the lookup is invoked once per token.
     */
    private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();

    static {
        SYNONYMS.put("2015", new String[]{"二零一五","20一5"});
        SYNONYMS.put("redis", new String[]{"内存数据库","re内存"});
    }

    /**
     * Returns the synonyms registered for the given word.
     *
     * @param str the word to look up
     * @return a fresh array holding the synonyms, or {@code null} if the word
     *         is not in the table
     */
    @Override
    public String[] getSameWords(String str) {
        String[] found = SYNONYMS.get(str);
        // Hand out a copy so callers cannot modify the shared table.
        return found == null ? null : found.clone();
    }
}
5 分词器分析工具类AnalyzerUtils
package com.liu.lucene.pro;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
 * Helper for inspecting what an analyzer produces.
 */
public class AnalyzerUtils {

    /**
     * Analyzes the reader's content with the given analyzer (under the field
     * name {@code "path"}) and prints every term to standard output as
     * {@code [term]}, with no separator between terms.
     *
     * <p>The token stream is consumed following the Lucene workflow:
     * {@code reset()}, {@code incrementToken()} until exhausted,
     * {@code end()}, then {@code close()}. Closing is required before the
     * same analyzer can hand out another stream on this thread.
     *
     * <p>An {@link IOException} raised while reading is printed via
     * {@code printStackTrace()} and not rethrown.
     *
     * @param analyzer analyzer used to tokenize the text
     * @param reader   source of the text to analyze
     */
    public static void displayTokens(Analyzer analyzer, Reader reader) {
        // try-with-resources closes the stream even if tokenizing fails.
        try (TokenStream tokenStream = analyzer.tokenStream("path", reader)) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.print("[" + term.toString() + "]");
            }
            // Signals end-of-stream so filters can finalize their state.
            tokenStream.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
6 测试类 TestUnit
package com.liu.lucene.test;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.Reader;
import org.junit.Before;
import org.junit.Test;
import com.liu.lucene.pro.AnalyzerUtils;
import com.liu.lucene.pro.LuceneIndex;
import com.liu.lucene.pro.SameWordAnalyzer;
import com.liu.lucene.pro.SameWordEngineImpl;
/**
 * JUnit tests exercising indexing, searching and token display with
 * {@link SameWordAnalyzer}.
 */
public class TestUnit {

    /** Index helper under test; recreated before every test method. */
    private LuceneIndex index = null;

    @Before
    public void setUp() {
        index = new LuceneIndex();
    }

    /** Runs indexing without passing an analyzer. */
    @Test
    public void testIndex() {
        // NOTE(review): meaning of the boolean flag is defined by LuceneIndex — confirm.
        index.index(true);
    }

    /** Runs indexing with the synonym analyzer. */
    @Test
    public void testIndexAnalyzer() {
        index.index(true, new SameWordAnalyzer(new SameWordEngineImpl()));
    }

    /** Searches for one of the synonyms registered for "2015". */
    @Test
    public void testSearch() {
        index.search("20一5", new SameWordAnalyzer(new SameWordEngineImpl()));
    }

    /**
     * Prints the tokens the synonym analyzer produces for a local log file.
     *
     * <p>The path is machine-specific. A missing file now fails the test
     * instead of being printed and ignored, so the test cannot pass without
     * actually running.
     *
     * @throws Exception if the file cannot be opened or closed
     */
    @Test
    public void testDisplayTokens() throws Exception {
        // try-with-resources guarantees the file handle is released.
        try (Reader reader = new FileReader("D:\\lhl\\developSoft\\apache-tomcat-7.0.62-windows-x64\\apache-tomcat-7.0.62\\logs\\loginfo.log.2015-11-27.log")) {
            AnalyzerUtils.displayTokens(new SameWordAnalyzer(new SameWordEngineImpl()), reader);
        }
    }
}