自定义Analyzer实现扩展停用词
- 继承自Analyzer并覆写createComponents(String)方法
- 维护自己的停用词词典
- 重写TokenStreamComponents,选择合适的过滤策略
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.util.ArrayList;
import java.util.List;
/**
 * Created by kangz on 2016/12/16.
 *
 * <p>Custom {@link Analyzer} that extends the default English stop words with
 * caller-supplied ones. Text is split and lower-cased by a
 * {@link LowerCaseTokenizer}, then passed through a {@link StopFilter} that
 * drops every token found in this analyzer's stop-word set.
 */
public class MyAnalzer extends Analyzer {

    /** Stop-word dictionary handed to the {@link StopFilter} in {@link #createComponents(String)}. */
    private CharArraySet stopWordSet;

    /**
     * Creates an analyzer that filters only the default English stop words
     * ({@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}).
     */
    public MyAnalzer() {
        super();
        // Assign the field directly instead of calling the overridable setter
        // from a constructor.
        this.stopWordSet = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }

    /**
     * Creates an analyzer that filters the default English stop words plus the
     * given extra words.
     *
     * @param stops additional stop words; {@code null} is treated as "no extra words"
     */
    public MyAnalzer(List<String> stops) {
        super();
        // Build a fresh set rather than adding to the shared default constant.
        // The set ignores case because the tokenizer lower-cases every token:
        // with a case-sensitive set, a stop word supplied with any upper-case
        // letter could never match and would silently not be filtered.
        CharArraySet merged = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
        if (stops != null) {
            merged.addAll(StopFilter.makeStopSet(stops, true));
        }
        this.stopWordSet = merged;
    }

    /**
     * Returns the stop-word set currently used by this analyzer.
     *
     * @return the internal set itself (not a copy)
     */
    public CharArraySet getStopWordSet() {
        return stopWordSet;
    }

    /**
     * Replaces the stop-word set used by this analyzer.
     *
     * <p>The new set is read the next time {@link #createComponents(String)} runs.
     * NOTE(review): Analyzer presumably reuses previously created components, so
     * a set installed after the analyzer has been used may not take effect for
     * already-created streams; confirm against the Lucene reuse strategy.
     *
     * @param stopWordSet the new stop-word set
     */
    public void setStopWordSet(CharArraySet stopWordSet) {
        this.stopWordSet = stopWordSet;
    }

    /**
     * Builds the analysis chain: a {@link LowerCaseTokenizer} followed by a
     * {@link StopFilter} over this analyzer's stop-word set.
     *
     * @param s the field name (not used by this implementation)
     * @return the tokenizer and the filtered stream wrapped as components
     */
    @Override
    protected TokenStreamComponents createComponents(String s) {
        Tokenizer source = new LowerCaseTokenizer();
        TokenStream filtered = new StopFilter(source, stopWordSet);
        return new TokenStreamComponents(source, filtered);
    }

    /**
     * Demo: analyzes a sample sentence with three custom stop words and prints
     * the tokens that survive filtering.
     *
     * @param args unused
     * @throws Exception if reading the token stream fails
     */
    public static void main(String[] args) throws Exception {
        // Collect the words to be filtered out in a list.
        List<String> strings = new ArrayList<String>();
        strings.add("小鬼子");
        strings.add("美国佬");
        strings.add("红毛鬼");

        String content = "小鬼子 and 美国佬 and 红毛鬼 are Playing Together!";

        // try-with-resources closes the stream and the analyzer even when
        // token iteration throws.
        try (Analyzer analyzer = new MyAnalzer(strings);
             TokenStream tokenStream = analyzer.tokenStream("myfield", content)) {
            tokenStream.reset();
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            while (tokenStream.incrementToken()) {
                // Custom stop words have already been filtered out here.
                // Expected output: playing, together
                System.out.println(charTermAttribute.toString());
            }
            // end() must be called after the last token and before close().
            tokenStream.end();
        }
    }
}
参考资料:http://codepub.cn/2016/05/23/Lucene-6-0-in-action-4-The-text-analyzer/
下面是小编的微信转帐二维码,小编再次谢谢读者的支持,小编会更努力的
----请看下方↓↓↓↓↓↓↓
百度搜索 Drools从入门到精通:可下载开源全套Drools教程
深度Drools教程不断更新中:
更多Drools实战陆续发布中………
扫描下方二维码关注公众号 ↓↓↓↓↓↓↓↓↓↓