吐槽:
1.这周忙如狗,忘记学习了,目前还处于抄别人代码,不知所以然的尴尬地步。。。
说明:
1.分词的数据流:reader->tokenizer->多个tokenFilter过滤->tokenStream
2.用到了中文同义词,需要mmseg4j jar包的支持,主要使用到人家的分词器(MMSegTokenizer类),这个咱目前还是直接用现成的。
然后自定义了tokenFilter,最后自定义了一个中文分词器MySynonymAnalyzer
3.实现要有个词库文件,如代码中的词库目录 D:\test\dictory,保存中文词库信息;构建MMSegTokenizer对象时需要读取词库信息
可以自己网上找额。。。
代码:
1.自定义分词过滤器
package synonym;
import java.io.IOException;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * A token filter that injects synonyms into the token stream.
 *
 * <p>For every token produced by the wrapped stream, the configured
 * {@link SynonymContext} is consulted; any synonyms found are emitted as
 * additional tokens at the same position (position increment 0), so a
 * query matches either the original word or any of its synonyms.
 *
 * @version 2014-8-25 下午01:56:08
 */
public class MySynonymFilter extends TokenFilter {
    private final CharTermAttribute cta;
    private final PositionIncrementAttribute pia;
    // Captured attribute state of the token whose synonyms are pending,
    // so each synonym can be restored onto the same position.
    private AttributeSource.State current;
    // Synonyms still to be emitted for the current token.
    private final Queue<String> synonymQueue;
    // Pluggable synonym lookup strategy.
    private final SynonymContext synonymContext;

    protected MySynonymFilter(TokenStream input, SynonymContext synonymContext) {
        super(input);
        cta = addAttribute(CharTermAttribute.class);
        pia = addAttribute(PositionIncrementAttribute.class);
        synonymQueue = new LinkedBlockingQueue<String>();
        this.synonymContext = synonymContext;
    }

    @Override
    public boolean incrementToken() throws IOException {
        // First drain any synonyms queued for the previously returned token.
        if (!synonymQueue.isEmpty()) {
            String synonym = synonymQueue.poll();
            // Restore the attributes of the original token...
            restoreState(current);
            // ...then overwrite the term text with the synonym.
            cta.setEmpty();
            cta.append(synonym);
            // Position increment 0 stacks the synonym on the same position
            // as the original token.
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false; // upstream stream is exhausted
        }
        if (addSynonyms(cta.toString())) {
            // Synonyms exist for this token: capture the current state so
            // they can be emitted at the same position on later calls.
            current = captureState();
        }
        return true;
    }

    /**
     * Resets this filter for reuse. Clears synonyms left over from a
     * previous document so they cannot leak into the next one, as required
     * by the Lucene {@code TokenStream} reuse contract.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        synonymQueue.clear();
        current = null;
    }

    /**
     * Queues the synonyms of {@code source}, if any.
     *
     * @param source the current token's text
     * @return {@code true} if at least one synonym was queued
     */
    private boolean addSynonyms(String source) {
        String[] synonyms = synonymContext.getSamewords(source);
        if (synonyms == null || synonyms.length == 0) {
            return false;
        }
        for (String synonym : synonyms) {
            synonymQueue.add(synonym);
        }
        return true;
    }
}
2.自定义同义词分词器
package synonym;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
/**
 * A simple synonym-aware analyzer for Chinese text.
 *
 * <p>Pipeline: reader -&gt; {@code MMSegTokenizer} (mmseg4j max-word
 * segmentation) -&gt; {@link MySynonymFilter} -&gt; token stream.
 *
 * @version 2014-8-25 上午10:46:21
 */
public class MySynonymAnalyzer extends Analyzer {
    // mmseg4j dictionary loaded from a directory on disk, e.g. D:\test\dictory.
    protected Dictionary dic;
    // Source of synonyms fed to MySynonymFilter.
    private final SynonymContext synonymContext;

    /**
     * Creates an analyzer using the default {@link SimpleSynonymContext}.
     *
     * @param path directory containing the mmseg4j dictionary files
     */
    public MySynonymAnalyzer(String path) {
        this(path, new SimpleSynonymContext());
    }

    /**
     * Creates an analyzer with a custom synonym source.
     *
     * @param path           directory containing the mmseg4j dictionary files
     * @param synonymContext supplies the synonyms for each token
     */
    public MySynonymAnalyzer(String path, SynonymContext synonymContext) {
        dic = Dictionary.getInstance(path);
        this.synonymContext = synonymContext;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        // reader -> tokenizer -> token filter(s) -> token stream
        Tokenizer source = new MMSegTokenizer(new MaxWordSeg(dic), reader);
        TokenStream filter = new MySynonymFilter(source, synonymContext);
        return new TokenStreamComponents(source, filter);
    }
}