编写一个专门获取同义词的引擎,以及使用该引擎的同义词过滤器和分析器:
package com.daelly.sample.lucene.analyzer.custom;
import java.io.IOException;
/**
 * Lookup source that supplies the synonyms of a single term.
 */
public interface SynonymEngine {
/**
 * Returns the synonyms known for the given term.
 *
 * @param s the term to look up
 * @return the synonyms of {@code s}, or {@code null} when the term has none
 *         (SynonymFilter treats {@code null} as "no synonyms")
 * @throws IOException if the implementation fails to perform the lookup
 */
String[] getSynonyms(String s) throws IOException;
}
package com.daelly.sample.lucene.analyzer.custom;
import java.io.IOException;
import java.util.HashMap;
/**
 * In-memory {@link SynonymEngine} backed by a small, hard-coded synonym table.
 */
public class SimpleSynonymEngine implements SynonymEngine {

    /** Term to synonyms table; filled once when the class is loaded and never modified afterwards. */
    private static final HashMap<String, String[]> SYNONYMS = new HashMap<>();

    // A static initializer (not an instance initializer) so that the shared table is
    // populated exactly once instead of being rewritten by every new instance.
    static {
        SYNONYMS.put("我", new String[]{"俺", "咱"});
        SYNONYMS.put("中国", new String[]{"天朝"});
        SYNONYMS.put("广州", new String[]{"五羊城"});
    }

    /**
     * Looks up the synonyms of a term in the built-in table.
     *
     * @param s the term to look up
     * @return a fresh array holding the synonyms of {@code s}, or {@code null} when the
     *         term is not in the table
     * @throws IOException never thrown by this implementation; declared by the interface
     */
    @Override
    public String[] getSynonyms(String s) throws IOException {
        String[] synonyms = SYNONYMS.get(s);
        // Hand out a copy so callers cannot alter the shared table through the array.
        return synonyms == null ? null : synonyms.clone();
    }
}
package com.daelly.sample.lucene.analyzer.custom;
import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Token filter that emits, after every token that has synonyms, one extra token per
 * synonym with a position increment of 0, so each synonym occupies the same position
 * as the original token.
 */
public class SynonymFilter extends TokenFilter {
    /** Token type name for synonym tokens; not applied by the code in this class. */
    public static final String TOKEN_TYPE_SYNONYM = "SYNONYM";

    /** Synonyms of the most recently emitted original token that are still to be emitted. */
    private final Stack<String> synonymStack;
    /** Source of synonyms for each term. */
    private final SynonymEngine synonymEngine;
    /** Attribute state of the original token whose synonyms are pending. */
    private AttributeSource.State current;
    private final CharTermAttribute termAttr;
    private final PositionIncrementAttribute posIncrAttr;

    /**
     * Creates a filter over the given stream.
     *
     * @param input  the upstream token stream
     * @param engine the engine queried for the synonyms of every token
     */
    protected SynonymFilter(TokenStream input, SynonymEngine engine) {
        super(input);
        synonymStack = new Stack<>();
        this.synonymEngine = engine;
        this.termAttr = addAttribute(CharTermAttribute.class);
        this.posIncrAttr = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token: a pending synonym if there is one, otherwise the next
     * token from the wrapped stream.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the wrapped stream or the synonym engine fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!synonymStack.isEmpty()) {
            String syn = synonymStack.pop();
            // Start from the original token's attributes, then overwrite only the
            // term text and the position increment.
            restoreState(current);
            termAttr.copyBuffer(syn.toCharArray(), 0, syn.length());
            // Increment 0 places the synonym at the same position as the original token.
            posIncrAttr.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        // Each call can emit only one token, so the original token is returned now and
        // its synonyms are emitted by the following calls. The state is captured here
        // so those calls can restore the original token's attributes.
        if (addAliasesToStack()) {
            current = captureState();
        }
        return true;
    }

    /**
     * Discards pending synonyms and the captured state so that a reused filter does not
     * carry tokens over from a previous, possibly partially consumed, stream.
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonymStack.clear();
        current = null;
    }

    /**
     * Pushes the synonyms of the current term onto the pending stack.
     *
     * @return {@code true} if at least one synonym was pushed, {@code false} otherwise
     * @throws IOException if the synonym engine fails
     */
    private boolean addAliasesToStack() throws IOException {
        String termVal = termAttr.toString();
        String[] synonyms = synonymEngine.getSynonyms(termVal);
        if (synonyms == null || synonyms.length == 0) {
            return false;
        }
        for (String synonym : synonyms) {
            synonymStack.push(synonym);
        }
        return true;
    }
}
package com.daelly.sample.lucene.analyzer.custom;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;
/**
 * Analyzer that tokenizes text with {@link IKTokenizer} and then expands the tokens
 * with synonyms through {@link SynonymFilter}.
 */
public class SynonymAnalyzer extends Analyzer {

    /** Synonym source handed to every filter this analyzer creates. */
    private final SynonymEngine engine;

    /**
     * @param engine the engine the synonym filter queries for each token
     */
    public SynonymAnalyzer(SynonymEngine engine) {
        this.engine = engine;
    }

    /**
     * Builds the analysis chain: IK tokenizer followed by the synonym filter.
     *
     * @param fieldName name of the field being analyzed; not used by this implementation
     * @return the tokenizer together with the synonym-expanding stream wrapped around it
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // NOTE(review): the 'true' flag presumably selects IK's smart segmentation mode - confirm.
        final Tokenizer source = new IKTokenizer(true);
        final TokenStream withSynonyms = new SynonymFilter(source, engine);
        return new TokenStreamComponents(source, withSynonyms);
    }
}
编写测试:
package com.daelly.sample.lucene;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Before;
import org.junit.Test;
import com.daelly.sample.lucene.analyzer.custom.SimpleSynonymEngine;
import com.daelly.sample.lucene.analyzer.custom.SynonymAnalyzer;
/**
 * Verifies that a document indexed with {@link SynonymAnalyzer} can be found by the
 * synonyms of the words it contains.
 */
public class CommonAnalyzerTest {

    /** In-memory index rebuilt before every test. */
    Directory dir;

    /**
     * Indexes a single document whose content contains the three words that have
     * synonyms in {@link SimpleSynonymEngine}.
     */
    @Before
    public void setUp() throws Exception {
        dir = new RAMDirectory();
        IndexWriterConfig conf = new IndexWriterConfig(new SynonymAnalyzer(new SimpleSynonymEngine()));
        // try-with-resources closes the writer even if indexing fails.
        try (IndexWriter writer = new IndexWriter(dir, conf)) {
            Document doc = new Document();
            doc.add(new TextField("content", "我来自中国广州", Field.Store.YES));
            writer.addDocument(doc);
        }
    }

    /** The synonym of "中国" must match the document. */
    @Test
    public void test1() throws IOException {
        assertSingleHit("天朝");
    }

    /** A synonym of "我" must match the document. */
    @Test
    public void test2() throws IOException {
        assertSingleHit("俺");
    }

    /** The synonym of "广州" must match the document. */
    @Test
    public void test3() throws IOException {
        assertSingleHit("五羊城");
    }

    /**
     * Runs a term query on the "content" field and asserts that exactly one document matches.
     *
     * @param text the exact term to search for
     * @throws IOException if the index cannot be read
     */
    private void assertSingleHit(String text) throws IOException {
        Query query = new TermQuery(new Term("content", text));
        // Close the reader after each search so the test does not leak it.
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs docs = searcher.search(query, 10);
            assertEquals(1, docs.totalHits);
        }
    }
}
测试结果:同义词分词器生效,通过同义词也能检索到原文档。