一、目前存在的问题
在getSameWords()方法中,我们使用map临时存放了两个键值对用来测试,实际开发中,往往需要很多的这种键值对来处理,比如从某个同义词词典里面获取值之类的,所以说,我们需要一个类,根据key提供近义词。
为了能更好地适应应用场景,我们先定义一个接口,其中定义一个getSameWords()方法,再定义一个实现类,实现getSameWords()方法,当我们需要更换字典的时候,更换实现类,重新实现getSameWords()方法提供不同的同义词即可。
二、代码实现
package com.wsy;
/**
 * Supplies synonyms for a given word. Swap in a different implementation
 * to back the synonym filter with a different dictionary.
 */
public interface SameWordContext {
    /**
     * Returns the synonyms registered for the given word.
     *
     * @param key the word to look up
     * @return an array of synonyms, or {@code null} if none are known
     */
    String[] getSameWords(String key);
}
package com.wsy;
import java.util.HashMap;
import java.util.Map;
/**
 * A trivial in-memory synonym dictionary used for testing: two hard-coded
 * entries. A real implementation would load entries from a synonym
 * dictionary file instead.
 */
public class SimpleSameWordContext implements SameWordContext {

    // Head word -> synonyms. Populated once in the constructor, read-only afterwards.
    private final Map<String, String[]> map = new HashMap<String, String[]>();

    public SimpleSameWordContext() {
        map.put("中国", new String[]{"天朝", "大陆"});
        map.put("我", new String[]{"俺", "咱"});
        // Alternatively, read a synonym dictionary and put its entries into the map.
    }

    /**
     * Returns the synonyms for {@code key}, or {@code null} when the word
     * has no entry in this dictionary.
     */
    @Override
    public String[] getSameWords(String key) {
        return map.get(key);
    }
}
package com.wsy;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
/**
 * A token filter that injects synonyms into the stream: for every token that
 * has synonyms (per the supplied {@link SameWordContext}), the synonyms are
 * emitted as extra tokens at the same position (position increment 0).
 */
public class MySameTokenFilter extends TokenFilter {
    // Text of the current token.
    private final CharTermAttribute charTermAttribute;
    // Gap to the previous token; 0 stacks a token on the previous one's position.
    private final PositionIncrementAttribute positionIncrementAttribute;
    // Captured attribute state of the token whose synonyms are still pending.
    private State state;
    // Synonyms waiting to be emitted for the most recent token.
    private final Stack<String> stack;
    // Dictionary that supplies synonyms per key.
    private final SameWordContext sameWordContext;

    protected MySameTokenFilter(TokenStream input, SameWordContext sameWordContext) {
        super(input);
        charTermAttribute = this.addAttribute(CharTermAttribute.class);
        positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class);
        stack = new Stack<String>();
        this.sameWordContext = sameWordContext;
    }

    /**
     * Advances to the next token, somewhat like Iterator.hasNext():
     * returns true while there is another token to process, false when
     * the stream is exhausted. Pending synonyms are emitted one per call
     * before the underlying stream is advanced again.
     */
    @Override
    public boolean incrementToken() throws IOException {
        // If synonyms are pending, emit one of them before consuming more input.
        if (!stack.isEmpty()) {
            String synonym = stack.pop();
            // Restore the captured state so offsets/type match the original token.
            restoreState(state);
            // Replace the token text with the synonym.
            charTermAttribute.setEmpty();
            charTermAttribute.append(synonym);
            // Zero increment places the synonym at the same position as the original.
            positionIncrementAttribute.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (pushSameWords(charTermAttribute.toString())) {
            // The token has synonyms: capture its state so each synonym emitted
            // later can restore offsets/type from it.
            state = captureState();
        }
        return true;
    }

    /**
     * Pushes all synonyms of {@code key} onto the pending stack.
     *
     * @return true if the key had at least a (possibly empty) synonym entry
     */
    private boolean pushSameWords(String key) {
        String[] sameWords = sameWordContext.getSameWords(key);
        if (sameWords != null) {
            for (String sameWord : sameWords) {
                stack.push(sameWord);
            }
            return true;
        }
        return false;
    }
}
package com.wsy;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
/**
 * Analyzer that segments Chinese text with mmseg4j (max-word mode) and then
 * injects synonyms via {@link MySameTokenFilter}.
 */
public class MySameAnalyzer extends Analyzer {

    // Source of synonyms injected into every token stream this analyzer produces.
    private final SameWordContext sameWordContext;

    public MySameAnalyzer(SameWordContext sameWordContext) {
        this.sameWordContext = sameWordContext;
    }

    @Override
    public TokenStream tokenStream(String string, Reader reader) {
        // Segmentation dictionary for mmseg4j.
        // NOTE(review): hard-coded absolute path — consider making it configurable.
        Dictionary dictionary = Dictionary.getInstance("E:\\Lucene\\mmseg4j-1.8.5\\data");
        return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dictionary), reader), sameWordContext);
    }

    /**
     * Tokenizes {@code string} with {@code analyzer} and prints, per token:
     * position increment, term text, start/end offsets, and token type.
     */
    public static void displayAllToken(String string, Analyzer analyzer) {
        try {
            TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(string));
            // Attach attributes so the stream's per-token information can be inspected.
            // Position increment: distance between consecutive tokens.
            PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
            // Character offsets of each token in the input.
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            // Term text of each token.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            // Token type reported by the tokenizer.
            TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
            while (tokenStream.incrementToken()) {
                System.out.println(positionIncrementAttribute.getPositionIncrement() + ":" + charTermAttribute + "[" + offsetAttribute.startOffset() + "-" + offsetAttribute.endOffset() + "]-->" + typeAttribute.type());
            }
            // Signal end-of-stream and release the underlying reader.
            tokenStream.end();
            tokenStream.close();
            System.out.println("----------------------------");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Demo: index one document with the synonym analyzer, then show that a
     * search for the synonym "天朝" finds the document containing "中国".
     */
    public static void main(String[] args) throws IOException {
        String string = "我来自中国";
        MySameAnalyzer analyzer = new MySameAnalyzer(new SimpleSameWordContext());
        Directory directory = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, analyzer));
        Document document = new Document();
        document.add(new Field("content", string, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(document);
        indexWriter.close();
        // Keep a handle on the reader so it can be closed explicitly: closing an
        // IndexSearcher constructed from a reader does not close that reader.
        IndexReader indexReader = IndexReader.open(directory);
        IndexSearcher indexSearcher = new IndexSearcher(indexReader);
        try {
            TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("content", "天朝")), 10);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            if (scoreDocs.length > 0) {
                document = indexSearcher.doc(scoreDocs[0].doc);
                System.out.println(document.get("content"));
            }
        } finally {
            // Release searcher, reader and directory even if the search throws.
            indexSearcher.close();
            indexReader.close();
            directory.close();
        }
        MySameAnalyzer.displayAllToken(string, analyzer);
    }
}