1、分词按照空格、横杠、点号进行拆分;
2、实现hi与hello的同义词查询功能;
3、实现hi与hello同义词的高亮显示;
MyAnalyzer实现代码:
/**
 * Analyzer whose token stream consists solely of a {@link MyTokenizer}.
 *
 * <p>The analyzer type given at construction time is forwarded unchanged to every
 * tokenizer this analyzer creates.
 */
public class MyAnalyzer extends Analyzer {

    /** Mode flag forwarded to each {@link MyTokenizer} instance. */
    private int analyzerType;

    /**
     * Creates an analyzer operating in the given mode.
     *
     * @param type mode flag handed to {@link MyTokenizer}
     */
    public MyAnalyzer(int type) {
        this.analyzerType = type;
    }

    /**
     * Builds the analysis chain for a field: a single {@link MyTokenizer} with no filters.
     *
     * @param fieldName name of the field being analyzed
     * @param reader    source of the field's text
     * @return components wrapping the newly created tokenizer
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        return new TokenStreamComponents(new MyTokenizer(fieldName, reader, this.analyzerType));
    }
}
MyTokenizer实现代码:
public class MyTokenizer extends Tokenizer {
public class WordUnit{
WordUnit(String word, int start, int length){
this.word = word;
this.start = start;
this.length = length;
//System.out.println("\tWordUnit: " + word + "|" + start + "|" + length);
}
String word;
int start;
int length;
}
private int analyzerType;
private int endPosition;
private Iterator<WordUnit> it;
private ArrayList<WordUnit> words;
private final CharTermAttribute termAtt;
private final OffsetAttribute offsetAtt;
public MyTokenizer(String fieldName, Reader in, int type) {
super(in);
it = null;
endPosition = 0;
analyzerType = type;
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
addAttribute(PayloadAttribute.class);
}
@Override
public boolean incrementToken() throws IOException {
clearAttributes();
char[] inputBuf = new char[1024];
if(it == null) {
int bufSize = input.read(inputBuf);
if(bufSize <= 0) return false;
int beginIndex = 0;
int endIndex = 0;
words = new ArrayList<WordUnit>();
for(endIndex = 0; endIndex < bufSize; endIndex++) {
if(inputBuf[endIndex] != '-' && inputBuf[endIndex] != ' ' && inputBuf[endIndex] != '.') continue;
addWord(inputBuf, beginIndex, endIndex);
beginIndex = endIndex + 1;
}
addWord(inputBuf, beginIndex, endIndex);//add the last
if(words.isEmpty()) return false;
it = words.iterator();
}
if(it != null && it.hasNext()){
WordUnit word = it.next();
termAtt.append(word.word);
termAtt.setLength(word.word.length());
endPosition = word.start + word.length;
offsetAtt.setOffset(word.start, endPosition);
return true;
}
return false;
}
@Override
public void reset() throws IOException {
super.reset();
it = null;
endPosition = 0;
}
@Override
public final void end() {
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
}
private void addWord(char[] inputBuf, int begin, int end){
if(end <= begin) return;
String word = new String(inputBuf, begin, end - begin);
words.add(new WordUnit(word, begin, end - begin));
if(analyzerType == 0 && word.equals("hi")) words.add(new WordUnit("hello", begin, 2));
if(analyzerType == 0 && word.equals("hello")) words.add(new WordUnit("hi", begin, 5));
}
}
索引的时候分词器类型:analyzerType=0(在拆分的基础上为 hi/hello 追加同义词词元);
搜索的时候分词器类型:analyzerType=1(只按分隔符拆分,不追加同义词);
高亮的时候分词器类型:analyzerType=0(追加同义词词元,使原文中的同义词也能被高亮);
搜索hello时的效果如下: