Lucene5中编写自定义同义词分词器(基于IK中文分词器)二

上一篇中我们使用自定义的同义词引擎和自定义的TokenFilter结合IK分词器完成了一个同义词分析器。其实Lucene本身有同义词的相关支持,本篇就使用Lucene自带的SynonymFilterFactory来完成和上一篇相同的同义词分词器。

package com.daelly.sample.lucene.analyzer.synonyms;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.ClasspathResourceLoader;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * An {@link Analyzer} that tokenizes Chinese text with the IK tokenizer and
 * expands synonyms via Lucene's built-in {@link SynonymFilterFactory}.
 *
 * <p>The synonyms file location may be either a filesystem path or a
 * {@code classpath:}-prefixed resource path.
 */
public class SynonymsAnalyzer extends Analyzer {

	/** Prefix marking a synonyms file that should be loaded from the classpath. */
	private static final String CLASSPATH_PREFIX = "classpath:";

	// Location of the synonyms file; either "classpath:..." or a filesystem path.
	private final String synonymsPath;

	/**
	 * @param synonymsPath location of the synonyms file; must be non-null and non-empty
	 * @throws IllegalArgumentException if {@code synonymsPath} is null or empty
	 */
	public SynonymsAnalyzer(String synonymsPath) {
		if (synonymsPath == null || synonymsPath.isEmpty()) {
			throw new IllegalArgumentException("synonymsPath must be provided!");
		}
		this.synonymsPath = synonymsPath;
	}

	@Override
	protected TokenStreamComponents createComponents(String fieldName) {
		SynonymFilterFactory factory;
		try {
			factory = getSynonymFilterFactory();
		} catch (IOException e) {
			// Fail fast: silently dropping the synonym filter (as a fallback
			// would do) produces an index without synonyms and makes searches
			// fail in confusing ways far from the root cause.
			throw new IllegalStateException(
					"Failed to load synonyms from " + synonymsPath, e);
		}
		Tokenizer tokenizer = new IKTokenizer(true);
		TokenStream tokenStream = factory.create(tokenizer);
		return new TokenStreamComponents(tokenizer, tokenStream);
	}

	/**
	 * Builds a {@link SynonymFilterFactory} backed by the configured synonyms
	 * file, resolving it from the classpath or the filesystem.
	 *
	 * @return an initialized factory ready to wrap a token stream
	 * @throws IOException if the synonyms file cannot be read or parsed
	 */
	private SynonymFilterFactory getSynonymFilterFactory() throws IOException {
		Map<String, String> args = new HashMap<>();
		// startsWith (not contains): the prefix is only meaningful at the front.
		if (synonymsPath.startsWith(CLASSPATH_PREFIX)) {
			args.put("synonyms", synonymsPath.substring(CLASSPATH_PREFIX.length()));
			SynonymFilterFactory factory = new SynonymFilterFactory(args);
			factory.inform(new ClasspathResourceLoader());
			return factory;
		}
		// Use the Path API instead of lastIndexOf(File.separator): it handles
		// both '/' and '\' separators and a bare filename (no directory part),
		// where the index-based split would throw StringIndexOutOfBoundsException.
		Path file = Paths.get(synonymsPath);
		args.put("synonyms", file.getFileName().toString());
		SynonymFilterFactory factory = new SynonymFilterFactory(args);
		Path parent = file.getParent();
		Path baseDirectory = (parent != null) ? parent : Paths.get(".");
		factory.inform(new FilesystemResourceLoader(baseDirectory));
		return factory;
	}

}

编写测试类

package com.daelly.sample.lucene;

import static org.junit.Assert.assertEquals;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Before;
import org.junit.Test;

import com.daelly.sample.lucene.analyzer.synonyms.SynonymsAnalyzer;

/**
 * Tests that {@link SynonymsAnalyzer} indexes synonyms: a document containing
 * 「我来自中国广州」 should be findable by the synonyms 天朝/俺/五羊城 declared
 * in the synonyms file.
 */
public class SynonymAnalyzerTest {

	// In-memory index rebuilt before each test.
	Directory dir;

	@Before
	public void setUp() throws Exception {
		dir = new RAMDirectory();
		// NOTE(review): hard-coded absolute path — the test only runs on a
		// machine where this file exists; consider a classpath resource.
		IndexWriterConfig conf = new IndexWriterConfig(
				new SynonymsAnalyzer("C:\\Users\\Administrator\\Desktop\\synonyms.txt"));
		// try-with-resources guarantees the writer (and its write lock) is released.
		try (IndexWriter writer = new IndexWriter(dir, conf)) {
			Document doc = new Document();
			doc.add(new TextField("content", "我来自中国广州", Field.Store.YES));
			writer.addDocument(doc);
		}
	}

	@Test
	public void test1() throws IOException {
		assertEquals(1, countHits("天朝"));
	}

	@Test
	public void test2() throws IOException {
		assertEquals(1, countHits("俺"));
	}

	@Test
	public void test3() throws IOException {
		assertEquals(1, countHits("五羊城"));
	}

	/**
	 * Searches the "content" field for a single term and returns the hit count.
	 * Closes the reader (the original tests leaked it).
	 */
	private int countHits(String word) throws IOException {
		try (IndexReader reader = DirectoryReader.open(dir)) {
			IndexSearcher searcher = new IndexSearcher(reader);
			TopDocs docs = searcher.search(new TermQuery(new Term("content", word)), 10);
			return docs.totalHits;
		}
	}
}

得到的结果和之前的一样,这里要注意synonyms.txt的格式是:

我,俺,咱
中国,天朝
广州,五羊城


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值