package com.yuan;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKTokenizer;
/**
 * Analyzer that combines IK Chinese word segmentation with Lucene synonym
 * expansion. Tokens produced by {@link IKTokenizer} are passed through a
 * {@link SynonymFilterFactory}-built filter configured from a synonym
 * dictionary file on disk.
 *
 * <p>NOTE(review): the synonym dictionary path is hard-coded to
 * {@code c:/同义词/synonymword.dic} — confirm this matches the deployment
 * environment.
 */
public class SynonymAnalyzer extends Analyzer {

    /** Lucene compatibility version passed to the synonym filter factory. */
    private Version version = Version.LUCENE_47;

    /** Whether IK should use smart (coarse-grained) segmentation. */
    private boolean useSmart;

    /**
     * Creates an analyzer with an explicit Lucene match version and
     * fine-grained (non-smart) segmentation.
     *
     * @param version Lucene compatibility version for the synonym filter
     */
    public SynonymAnalyzer(Version version) {
        this.version = version;
    }

    /** Creates an analyzer with fine-grained segmentation and the default version. */
    public SynonymAnalyzer() {
        this(false);
    }

    /**
     * Creates an analyzer with the default Lucene version.
     *
     * @param useSmart {@code true} for IK smart segmentation,
     *                 {@code false} for fine-grained segmentation
     */
    public SynonymAnalyzer(boolean useSmart) {
        this.useSmart = useSmart;
    }

    /** @return whether IK smart segmentation is enabled */
    public boolean useSmart() {
        return useSmart;
    }

    /** @param useSmart enable ({@code true}) or disable IK smart segmentation */
    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new IKTokenizer(reader, useSmart());

        Map<String, String> filterArgs = new HashMap<String, String>();
        filterArgs.put("luceneMatchVersion", version.toString());
        filterArgs.put("synonyms", "c:/同义词/synonymword.dic");
        filterArgs.put("expand", "true");

        SynonymFilterFactory factory = new SynonymFilterFactory(filterArgs);
        try {
            // Load the synonym dictionary from the filesystem. Failing fast here
            // is deliberate: an uninformed factory would silently produce a
            // filter without any synonym mappings.
            factory.inform(new FilesystemResourceLoader());
        } catch (IOException e) {
            throw new RuntimeException("Failed to load synonym dictionary", e);
        }

        // The synonym filter must wrap the SAME tokenizer handed to the
        // components; wrapping a separate analyzer's stream (as the original
        // code did) consumes the reader twice and decouples the filter from
        // the tokenizer Lucene will actually reset and drive.
        return new TokenStreamComponents(tokenizer, factory.create(tokenizer));
    }
}
本类主要是参照IK分词器和SmartChineseAnalyzer编写的。
最后在网上查阅资料后,又进行了以下改动:
package com.yuan;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilterFactory;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.wltea.analyzer.lucene.IKTokenizer;

/**
 * Analyzer combining IK Chinese segmentation with synonym expansion driven
 * by a plain-text synonyms file.
 *
 * <p>NOTE(review): the synonyms file path is hard-coded to
 * {@code C:\同义词\synonyms.txt} — confirm it exists on the target machine.
 */
public class IKSynonymsAnalyzer extends Analyzer {

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Second argument is false: smart (coarse-grained) segmentation is
        // DISABLED, so IK runs in fine-grained mode. (The original comment
        // claimed the opposite.)
        Tokenizer tokenizer = new IKTokenizer(reader, false);

        // Use parameterized types instead of raw Map/HashMap to avoid
        // unchecked-conversion warnings.
        Map<String, String> paramsMap = new HashMap<String, String>();
        paramsMap.put("luceneMatchVersion", "LUCENE_43");
        paramsMap.put("synonyms", "C:\\同义词\\synonyms.txt");

        SynonymFilterFactory factory = new SynonymFilterFactory(paramsMap);
        try {
            // Resolve and load the synonyms file. Fail fast rather than
            // continuing with an uninformed factory, which would otherwise
            // produce a filter with no synonym mappings.
            factory.inform(new FilesystemResourceLoader());
        } catch (IOException e) {
            throw new RuntimeException("Failed to load synonyms file", e);
        }

        return new TokenStreamComponents(tokenizer, factory.create(tokenizer));
    }
}
今天一不小心被“中英文标点符号”折腾了一个上午:在同义词文件中,一组同义词之间只能用英文逗号“,”分隔,然而由于自己不细心,有一组词用了中文逗号“,”,导致各种测试都过不了,最后经过一步步排查才终于找到问题所在。
这里列出处理同义词文件时应当注意的几个问题:
①、文件编码问题
②、分割符号问题