根据分词器的分词流程,我们可以自定义分词器,这里扩展停用词分词器。
package cn.liuys.lucene.util;
import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
 * Custom stop-word analyzer: filters out Lucene's built-in English stop words,
 * optionally extended with a caller-supplied word list. Tokenizes on letters and
 * lower-cases before stop filtering.
 *
 * <p>NOTE(review): the original Javadoc called this a "synonym analyzer"
 * (自定义同义词分词器); the implementation is clearly a stop-word analyzer.
 *
 * @author liuys
 */
public class MyStopAnalyzer extends Analyzer {
// Stop-word set consulted by tokenStream(); raw type because the
// Lucene 3.5 makeStopSet API returns a raw Set.
@SuppressWarnings("rawtypes")
private final Set stops;
/**
 * Creates an analyzer whose stop set is the given words (matched
 * case-insensitively) plus Lucene's default English stop words.
 *
 * @param sws additional stop words to remove from the token stream
 */
@SuppressWarnings("unchecked")
public MyStopAnalyzer(String[] sws) {
stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
// Merge in the built-in English stop words as well.
stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
/** Creates an analyzer that uses only Lucene's default English stop words. */
public MyStopAnalyzer() {
stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}
/**
 * Builds the token pipeline: letter tokenizer → lower-case filter → stop filter.
 *
 * @param fieldName field being analyzed (unused by this pipeline)
 * @param reader    source text
 * @return the filtered token stream
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new StopFilter(Version.LUCENE_35,
new LowerCaseFilter(Version.LUCENE_35,
new LetterTokenizer(Version.LUCENE_35, reader)), stops);
}
}
@Test
public void test04(){
// Compare a stop analyzer extended with custom words against the default one.
Analyzer customStops = new MyStopAnalyzer(new String[]{"i","you","hate"});
Analyzer defaultStops = new MyStopAnalyzer();
String text = "how are you thank you,I hate you";
// dispaly[sic]Token is the helper's actual (misspelled) name in AnalyzerUtil.
AnalyzerUtil.dispalyToken(text, customStops);
AnalyzerUtil.dispalyToken(text, defaultStops);
}
中文分词器使用mmseg4j,使用非常简单,主要是需要指定词库的位置:
@Test
public void test05(){
// mmseg4j Chinese analyzer; the constructor takes the dictionary directory.
File dictDir = new File("F:\\stady\\JAVA\\other\\Lucene\\mmseg4j-1.8.5\\data");
Analyzer analyzer = new MMSegAnalyzer(dictDir);
String text = "我来自中国重庆南川,我喜欢健身";
AnalyzerUtil.dispalyToken(text, analyzer);
}