上一篇中我们使用自定义的同义词引擎和自定义的TokenFilter，结合IK分词器完成了一个同义词分析器。其实Lucene本身就提供了同义词的相关支持，本篇就用Lucene自带的SynonymFilterFactory来实现和上一篇功能相同的同义词分析器。
package com.daelly.sample.lucene.analyzer.synonyms;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.wltea.analyzer.lucene.IKTokenizer;
/**
 * Analyzer that tokenizes with {@link IKTokenizer} and expands the resulting tokens through
 * Lucene's built-in {@link SynonymFilterFactory}.
 *
 * <p>The synonyms file location is given either as {@code classpath:<resource>} (resolved with a
 * {@link ClasspathResourceLoader}) or as a filesystem path (resolved with a
 * {@link FilesystemResourceLoader} rooted at the file's parent directory).
 *
 * <p>If the synonyms file cannot be read, the analyzer deliberately degrades to the plain IK
 * tokenizer instead of failing; the I/O error is printed to standard error.
 */
public class SynonymsAnalyzer extends Analyzer {

    /** Prefix marking a synonyms location that must be resolved through the classpath. */
    private static final String CLASSPATH_PREFIX = "classpath:";

    /** Location of the synonyms file, exactly as supplied by the caller. */
    private final String synonymsPath;

    /**
     * Creates the analyzer.
     *
     * @param synonymsPath location of the synonyms file: {@code classpath:<resource>} or a
     *                     filesystem path
     * @throws IllegalArgumentException if {@code synonymsPath} is {@code null} or empty
     */
    public SynonymsAnalyzer(String synonymsPath) {
        if (synonymsPath == null || synonymsPath.isEmpty()) {
            throw new IllegalArgumentException("synonymsPath must be provided!");
        }
        this.synonymsPath = synonymsPath;
    }

    /**
     * Builds the analysis chain: IK tokenizer, followed by the synonym filter when the synonyms
     * file could be loaded.
     *
     * @param fieldName name of the field being analyzed (not used to vary the chain)
     * @return components wrapping the tokenizer and, if available, the synonym filter
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        SynonymFilterFactory factory = null;
        try {
            factory = getSynonymFilterFactory();
        } catch (IOException e) {
            // Best effort: keep analysis working without synonyms rather than aborting indexing.
            e.printStackTrace();
        }

        // NOTE(review): the boolean is presumably IK's "smart segmentation" flag; confirm
        // against the IK Analyzer version in use.
        Tokenizer tokenizer = new IKTokenizer(true);
        if (factory == null) {
            return new TokenStreamComponents(tokenizer);
        }
        TokenStream tokenStream = factory.create(tokenizer);
        return new TokenStreamComponents(tokenizer, tokenStream);
    }

    /**
     * Creates a {@link SynonymFilterFactory} and loads the configured synonyms file into it.
     *
     * @return a factory that has already been informed of its resource loader
     * @throws IOException if the synonyms file cannot be located, read or parsed
     */
    private SynonymFilterFactory getSynonymFilterFactory() throws IOException {
        // Only a leading "classpath:" selects classpath resolution; a filesystem path that
        // merely contains that text somewhere must not be rewritten.
        if (synonymsPath.startsWith(CLASSPATH_PREFIX)) {
            String resource = synonymsPath.substring(CLASSPATH_PREFIX.length());
            // Class-loader based lookups take names without a leading '/', so accept
            // "classpath:/synonyms.txt" as an alias of "classpath:synonyms.txt".
            while (resource.startsWith("/")) {
                resource = resource.substring(1);
            }
            SynonymFilterFactory factory = newFactory(resource);
            factory.inform(new ClasspathResourceLoader());
            return factory;
        }

        // Let java.nio split directory and file name: this copes with both '/' and '\\'
        // where the platform allows them, and with a bare file name (resolved against the
        // current working directory) which has no separator at all.
        Path file = Paths.get(synonymsPath).toAbsolutePath();
        Path baseDirectory = file.getParent();
        Path fileName = file.getFileName();
        if (baseDirectory == null || fileName == null) {
            throw new IOException("synonymsPath does not denote a file: " + synonymsPath);
        }

        SynonymFilterFactory factory = newFactory(fileName.toString());
        factory.inform(new FilesystemResourceLoader(baseDirectory));
        return factory;
    }

    /** Creates a factory configured to read the given resource name as its synonyms file. */
    private static SynonymFilterFactory newFactory(String synonymsResource) {
        Map<String, String> args = new HashMap<>();
        args.put("synonyms", synonymsResource);
        return new SynonymFilterFactory(args);
    }
}
编写测试类
package com.daelly.sample.lucene;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Before;
import org.junit.Test;
import com.daelly.sample.lucene.analyzer.synonyms.SynonymsAnalyzer;
/**
 * Checks that a document indexed through {@link SynonymsAnalyzer} can be found by the synonyms
 * of the words it contains, not only by the words themselves.
 *
 * <p>The synonyms file is read from a fixed local path, so that file must exist for the
 * synonym expansion to take place.
 */
public class SynonymAnalyzerTest {

    /** In-memory index rebuilt before every test. */
    Directory dir;

    /**
     * Indexes a single document whose "content" field is analyzed with the synonym analyzer.
     *
     * @throws Exception if the index cannot be written
     */
    @Before
    public void setUp() throws Exception {
        dir = new RAMDirectory();
        IndexWriterConfig conf = new IndexWriterConfig(new SynonymsAnalyzer("C:\\Users\\Administrator\\Desktop\\synonyms.txt"));
        // try-with-resources closes the writer even if adding the document fails.
        try (IndexWriter writer = new IndexWriter(dir, conf)) {
            Document doc = new Document();
            doc.add(new TextField("content", "我来自中国广州", Field.Store.YES));
            writer.addDocument(doc);
        }
    }

    /** The document is found through "天朝", listed as a synonym of "中国". */
    @Test
    public void test1() throws IOException {
        assertSingleHit("天朝");
    }

    /** The document is found through "俺", listed as a synonym of "我". */
    @Test
    public void test2() throws IOException {
        assertSingleHit("俺");
    }

    /** The document is found through "五羊城", listed as a synonym of "广州". */
    @Test
    public void test3() throws IOException {
        assertSingleHit("五羊城");
    }

    /**
     * Runs a term query on the "content" field and asserts that exactly one document matches.
     *
     * @param text the exact term to look up in the index
     * @throws IOException if the index cannot be opened or searched
     */
    private void assertSingleHit(String text) throws IOException {
        Query query = new TermQuery(new Term("content", text));
        // The reader holds index resources; close it once the search is done.
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs docs = searcher.search(query, 10);
            assertEquals(1, docs.totalHits);
        }
    }
}
得到的结果和之前的一样。这里要注意synonyms.txt的格式：每行是一组同义词，词与词之间用英文逗号分隔，例如：
我,俺,咱
中国,天朝
广州,五羊城