利用 MMseg4j 作为中文分词器,自定义同义词分词器
MMseg4j-2.3 jar包: http://download.csdn.net/detail/u010167215/9178257
MMseg4j-2.3 源码: http://download.csdn.net/detail/u010167215/9178265
Code 1:MySameWordTokenFilter
/**
 * Token filter that injects synonyms into a token stream.
 *
 * <p>Each token read from the wrapped stream is passed through unchanged. If the
 * {@link SamewordEngine} reports synonyms for it, they are emitted right after it,
 * each with a position increment of 0 so that they share the original token's position.
 */
public class MySameWordTokenFilter extends TokenFilter {
    /** Term text of the token currently exposed by this stream. */
    private final CharTermAttribute cta;
    /** Position increment of the token currently exposed by this stream. */
    private final PositionIncrementAttribute pia;
    /** Synonym lookup used for every input token. */
    private final SamewordEngine samewordEngine;
    /** Synonyms of the most recent input token that still have to be emitted. */
    private final Stack<String> sameWordStack = new Stack<String>();
    /** Attribute state captured for the input token whose synonyms are pending. */
    private AttributeSource.State currentState;

    /**
     * @param input  the token stream to read from
     * @param engine the synonym lookup
     */
    protected MySameWordTokenFilter(TokenStream input, SamewordEngine engine) {
        super(input);
        this.samewordEngine = engine;
        this.cta = addAttribute(CharTermAttribute.class);
        this.pia = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Pushes the synonyms of {@code key} onto the pending stack.
     *
     * @param key the original term
     * @return {@code true} if at least one synonym was queued, {@code false} otherwise
     */
    private boolean addSameWords(String key) {
        String[] sws = samewordEngine.getSameword(key);
        if (sws == null) {
            return false;
        }
        for (String str : sws) {
            sameWordStack.push(str);
        }
        // An empty array queues nothing, so there is no state worth capturing.
        return sws.length > 0;
    }

    /**
     * Emits a pending synonym if there is one, otherwise advances the wrapped stream.
     *
     * <p>Declared {@code final} because Lucene requires concrete token streams to be
     * final or to have a final {@code incrementToken()} (checked when assertions are on).
     *
     * @return {@code false} once the wrapped stream is exhausted and no synonym is pending
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!sameWordStack.isEmpty()) {
            String synonym = sameWordStack.pop();
            // Start from the original token's attributes, then swap in the synonym text.
            restoreState(currentState);
            cta.setEmpty();
            cta.append(synonym);
            // Increment 0 places the synonym at the same position as the original token.
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (addSameWords(cta.toString())) {
            currentState = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and drops any synonyms left over from a previous use,
     * so that a reused filter does not emit stale tokens into the next stream.
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sameWordStack.clear();
        currentState = null;
    }
}
Code 2:MySameWordAnalyzer
/**
 * Analyzer that tokenizes with an mmseg4j {@code MMSegTokenizer} and wraps the result
 * in a {@code MySameWordTokenFilter}.
 */
public class MySameWordAnalyzer extends Analyzer {
    /** Synonym lookup handed to every filter this analyzer builds. */
    private final SamewordEngine samewordEngine;

    /**
     * @param samewordEngine the synonym lookup to use
     */
    public MySameWordAnalyzer(SamewordEngine samewordEngine) {
        this.samewordEngine = samewordEngine;
    }

    /**
     * Builds the tokenizer/filter chain.
     *
     * @param arg0 not used by this implementation
     */
    @Override
    protected TokenStreamComponents createComponents(String arg0) {
        // Dictionary location. Per the original author's note, the no-argument
        // Dictionary.getInstance() can be used instead to load from the default path.
        MMSegTokenizer source =
                new MMSegTokenizer(new MaxWordSeg(Dictionary.getInstance("/xxxpath")));
        return new TokenStreamComponents(
                source, new MySameWordTokenFilter(source, samewordEngine));
    }
}
Code 3: SamewordEngine
/**
 * Source of synonyms for a word.
 *
 * <p>Implementations decide where the synonyms come from; for example, they can be
 * loaded from a file.
 */
public interface SamewordEngine {
    /**
     * Looks up the synonyms of a word.
     *
     * @param key the word to look up
     * @return the synonyms of {@code key}; {@code MySameWordTokenFilter} treats
     *         {@code null} as "no synonyms"
     */
    String[] getSameword(String key);
}
Code 4:SimpleSamewordEngine
/**
 * {@link SamewordEngine} backed by a small hard-coded, in-memory synonym table.
 */
public class SimpleSamewordEngine implements SamewordEngine {
    /** Maps a word to its synonyms. */
    private final Map<String, String[]> samewordsMap;

    /** Creates the engine and fills in the built-in synonym table. */
    public SimpleSamewordEngine() {
        Map<String, String[]> table = new HashMap<String, String[]>();
        // NOTE(review): "咋" may be a typo for "咱" — confirm with the author.
        table.put("我", new String[] {"俺", "咋"});
        this.samewordsMap = table;
    }

    /**
     * Returns the synonyms registered for {@code key}.
     *
     * @param key the word to look up
     * @return the registered synonyms, or {@code null} if the table has no entry for {@code key}
     */
    @Override
    public String[] getSameword(String key) {
        String[] synonyms = samewordsMap.get(key);
        return synonyms;
    }
}
Code 5:测试代码
/**
 * Tests synonym search: indexes one document with {@code MySameWordAnalyzer} and
 * looks it up through the synonym "俺".
 *
 * <p>General outline of the index/search workflow (tutorial notes):
 * <ol>
 *   <li>Index: create a Directory, create an IndexWriter, create a Document, add
 *       Fields to it, add the document through the IndexWriter, close the IndexWriter.</li>
 *   <li>Search: open the Directory, create an IndexReader on it, create an
 *       IndexSearcher from the reader, build a Query, run the search to obtain
 *       TopDocs, take the ScoreDoc entries, load each Document through the searcher,
 *       read the required values, close the reader.</li>
 * </ol>
 * Author's note: if the QueryParser jar does not match the Lucene core jar,
 * Lucene fails with {@code java.lang.NoSuchFieldError}.
 */
@Test
public void testSameWorldAnalyzer02() {
    String text = "我来自中国华南农业大学网路工程专业";
    // try-with-resources releases the analyzer, directory, writer and reader
    // even when indexing or searching throws.
    try (Analyzer mySameWordAnalyzer = new MySameWordAnalyzer(new SimpleSamewordEngine());
         Directory directory = new RAMDirectory()) {

        // Index a single document; the writer is closed before the reader is opened.
        try (IndexWriter iw = new IndexWriter(directory, new IndexWriterConfig(mySameWordAnalyzer))) {
            Document doc = new Document();
            doc.add(new TextField("content", text, Store.YES));
            iw.addDocument(doc);
        }

        try (IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            // Search the "content" field for the synonym, not for the original word.
            TermQuery query = new TermQuery(new Term("content", "俺"));
            TopDocs tds = indexSearcher.search(query, 10);
            ScoreDoc[] sds = tds.scoreDocs;
            if (sds.length == 0) {
                // Fail with a clear message instead of an ArrayIndexOutOfBoundsException.
                throw new AssertionError("no document matched the synonym query");
            }
            Document document = indexSearcher.doc(sds[0].doc);
            System.out.println("----->" + document.get("content"));
        }
    } catch (IOException e) {
        // Do not swallow I/O failures: a test that only prints the stack trace would pass.
        throw new AssertionError("index/search round trip failed", e);
    }
}