import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
public class MySameworkAnalyzer extends Analyzer {
@Override
public TokenStream tokenStream(String str, Reader reader) {
//获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器
Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data");
return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
}
}
#####################
@Test
public void test05(){
try {
Analyzer a1=new MySameworkAnalyzer();
String str="我来自中国,我的名字叫什么";
AnalyzerUtil.displayToken(str, a1);
Directory directory=new RAMDirectory();
IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1));
Document document=new Document();
document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED));
indexWriter.addDocument(document);
indexWriter.close();
IndexReader indexReader=IndexReader.open(directory);
IndexSearcher searcher=new IndexSearcher(indexReader);
TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10);
ScoreDoc[] docs=tds.scoreDocs;
Document doc=searcher.doc(docs[0].doc);
System.out.println(doc.get("content"));
searcher.close();
indexReader.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
###############
package com.lucene.util;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
public class MySameworkFilter extends TokenFilter {
//保存相应的词汇
private CharTermAttribute cta=null;
//保存词与词之间的位置增量
private PositionIncrementAttribute pia=null;
//定义一个状态
private AttributeSource.State current=null;
//用栈保存同义词集合
private Stack<String> sames=null;
protected MySameworkFilter(TokenStream input) {
super(input);
cta=this.addAttribute(CharTermAttribute.class);
pia=this.addAttribute(PositionIncrementAttribute.class);
sames=new Stack<String>();
}
@Override
public boolean incrementToken() throws IOException {
if(sames.size()>0){
//将元素出栈,并获取同义词
String str=sames.pop();
//还原状态
restoreState(current);
//先清空,再添加
cta.setEmpty();
cta.append(str);
//设置位置为0,表示同义词
pia.setPositionIncrement(0);
return true;
}
if(!this.input.incrementToken())
return false;
//如果改词中有同义词,捕获当前状态
if(this.getSamewords(cta.toString())){
current=captureState();
}
return true;
}
//定义同义词字典,并判断如果有同义词就返回true
private boolean getSamewords(String key){
Map<String, String[]> maps=new HashMap<String, String[]>();
maps.put("我", new String[]{"咱","俺"});
maps.put("中国", new String[]{"大陆","天朝"});
if(maps.get(key)!=null){
for(String s:maps.get(key)){
sames.push(s);
}
}
if(sames.size()>0){
return true;
}
return false;
}
}