Lucene7 使用Analyzer 过滤中文停用字符
当使用Lucene做中文分词全文检索的时候,经常会出现一个问题,就是一些常用的助词,例如:地、得、的等,一些一元的Analyzer会把这些词当做检索的词源。如何去掉这些词源呢?我是使用HanLP作为中文分词库的,但当我调用HanLP的CustomDictionary.remove("的")和StopWordDictionary.add("的")的时候,发现并没有生效,于是我改用了以下办法:
1.创建一个HanLPAnalyzer的子类
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import com.hankcs.lucene.HanLPAnalyzer;
/**
 * HanLP-based analyzer that additionally removes stop words (e.g. Chinese
 * particles such as 的/地/得) from the token stream produced by
 * {@link HanLPAnalyzer}.
 */
public class StopAnalyzerExtend extends HanLPAnalyzer {
    /** Stop-word set applied on top of HanLP's tokenization. */
    private CharArraySet stopWordSet;

    public CharArraySet getStopWordSet() {
        return this.stopWordSet;
    }

    public void setStopWordSet(CharArraySet stopWordSet) {
        this.stopWordSet = stopWordSet;
    }

    /** Creates an analyzer seeded with Lucene's built-in English stop-word set. */
    public StopAnalyzerExtend() {
        super();
        setStopWordSet(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    /**
     * @param stops additional stop words to filter out of the token stream
     */
    public StopAnalyzerExtend(List<String> stops) {
        this();
        // ENGLISH_STOP_WORDS_SET is created via CharArraySet.unmodifiableSet(...)
        // inside StopAnalyzer, so adding to it directly throws an exception;
        // work on a mutable copy instead.
        stopWordSet = CharArraySet.copy(getStopWordSet());
        stopWordSet.addAll(StopFilter.makeStopSet(stops));
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        TokenStreamComponents components = super.createComponents(fieldName);
        // BUGFIX: wrap the parent's final token stream, not the bare tokenizer.
        // Wrapping the tokenizer directly would silently drop any token filters
        // that HanLPAnalyzer installed between its tokenizer and its sink.
        return new TokenStreamComponents(components.getTokenizer(),
                new StopFilter(components.getTokenStream(), stopWordSet));
    }

    /*public static void main(String[] args) throws IOException {
        ArrayList<String> strings = new ArrayList<String>() {{
            add("的");
            add("人");
        }};
        Analyzer analyzer = new StopAnalyzerExtend(strings);
        String content = "我爱的人";
        TokenStream tokenStream = analyzer.tokenStream("myfield", content);
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            // Custom stop words ("的", "人") have been filtered out;
            // only the surviving terms are printed.
            System.out.println(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
    }*/
}
2.调用该子类做Query
// Extra stop words to strip on top of the analyzer's defaults.
// NOTE(review): the double-brace idiom creates an anonymous ArrayList subclass
// holding a reference to the enclosing instance — fine for a demo snippet.
ArrayList<String> strings = new ArrayList<String>() {{
add("的");
add("得");
}};
Analyzer analyzer = new StopAnalyzerExtend(strings);
// Per-field boosts: file content weighted 5x over the file name.
Map<String, Float> boostMap = new HashMap<String, Float>();
boostMap.put(UtilConstants.SearchResultFields.FILE_CONTENT,5.0f);
boostMap.put(UtilConstants.SearchResultFields.FILE_NAME,1.0f);
BooleanQuery.Builder builder=new BooleanQuery.Builder();
List<Query> queryList=new ArrayList<>();
// Search both the file-name and file-content fields with one parser.
String[] fields = {UtilConstants.SearchResultFields.FILE_NAME, UtilConstants.SearchResultFields.FILE_CONTENT};
QueryParser queryParserMut = new MultiFieldQueryParser(fields, analyzer,boostMap);
// Parse the user-supplied keyword into a query; the stop-word-aware
// analyzer drops particles like 的/得 before the terms reach the index.
// NOTE(review): `keyword` is defined outside this snippet — TODO confirm its source.
Query queryKeyword = queryParserMut.parse(keyword);