最近要做商品搜索的近义词功能,比如输入“商砼”要能搜到“混凝土”。我用的是 Solr 4.2 版本,在网上找了一圈都没有我要的方案,也没有现成的类,
于是找到 lucene TokenFilterFactory类
但我用的是 IKAnalyzer2012FF_u1 版本,其中分词入口只有
IKAnalyzer.class 这个类,而近义词需要以工厂方式(tokenizer 加 filter 的分析链)去处理,故在 org.wltea.analyzer.lucene 包下加了个 IKTokenizerFactory,让 IK 分词器也能以工厂方式接入
代码如下
package org.wltea.analyzer.lucene;
import java.io.Reader;
import java.util.Map;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
public class IKTokenizerFactory extends TokenizerFactory {
private boolean isMaxWordLength = false;
public void init(Map<String, String> args) {
String _arg = (String) args.get("isMaxWordLength");
this.isMaxWordLength = Boolean.parseBoolean(_arg);
}
public Tokenizer create(Reader reader) {
return new IKTokenizer(reader, isMaxWordLength());
}
public void setMaxWordLength(boolean isMaxWordLength) {
this.isMaxWordLength = isMaxWordLength;
}
public boolean isMaxWordLength() {
return this.isMaxWordLength;
}
}
编译后,把生成的 IKTokenizerFactory.class 放到 org.wltea.analyzer.lucene 包下。
以前我只能像下面这样定义 IK 分词,但这种写法不能实现近义词功能:
<!-- Legacy definition of the "text_ik" field type: IKAnalyzer is plugged in as a
     complete analyzer class for both index and query time, so there is no
     tokenizer/filter chain in which a synonym filter could be placed. -->
<fieldType name="text_ik" class="solr.TextField">
<!-- NOTE(review): isMaxWordLength is given as an attribute of the analyzer element;
     whether Solr forwards it to IKAnalyzer is not shown here - confirm. -->
<analyzer type="index" isMaxWordLength="false" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
<analyzer type="query" isMaxWordLength="true" class="org.wltea.analyzer.lucene.IKAnalyzer"/>
</fieldType>
加入 IKTokenizerFactory 类后,改造成下面这样(可以在 Solr core 的 conf 目录下的 synonyms.txt 文件中配置自己的近义词库):
<!-- Reworked "text_ik" field type: IK is declared through IKTokenizerFactory so that
     token filters, including the synonym filter, can follow it in the chain. -->
<fieldType name="text_ik" class="solr.TextField" positionIncrementGap="100">
<!-- Index-time chain: IK tokenizer, then lower-casing, then duplicate removal. -->
<analyzer type="index">
<!-- NOTE(review): no isMaxWordLength attribute is set on the tokenizer, so the factory
     presumably falls back to its default of false for both chains - confirm this is intended. -->
<tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
<!-- Query-time chain: same as the index chain, plus the synonym filter right after the tokenizer. -->
<analyzer type="query">
<tokenizer class="org.wltea.analyzer.lucene.IKTokenizerFactory" />
<!-- Synonyms come from synonyms.txt; matching ignores case (ignoreCase="true").
     NOTE(review): with expand="false" each synonym group is presumably mapped to a single
     term instead of being expanded - confirm against the Solr documentation. -->
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>