Lucene自定义同义词分词器

package com.lucene.util;
 
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 
 import com.chenlb.mmseg4j.Dictionary;
 import com.chenlb.mmseg4j.MaxWordSeg;
 import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
 
 public class MySameworkAnalyzer extends Analyzer {
 
     @Override
     public TokenStream tokenStream(String str, Reader reader) {
         //获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器
         Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data");
         return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
     }
 

 }

#####################

@Test
     public void test05(){
         try {
             Analyzer a1=new MySameworkAnalyzer();
             String str="我来自中国,我的名字叫什么";
             AnalyzerUtil.displayToken(str, a1);
             Directory directory=new RAMDirectory();
             IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1));
             Document document=new Document();
             document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED));
             indexWriter.addDocument(document);
             indexWriter.close();
             IndexReader indexReader=IndexReader.open(directory);
             IndexSearcher searcher=new IndexSearcher(indexReader);
             TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10);
             ScoreDoc[] docs=tds.scoreDocs;
             Document doc=searcher.doc(docs[0].doc);
             System.out.println(doc.get("content"));
             searcher.close();
             indexReader.close();
         } catch (CorruptIndexException e) {
             e.printStackTrace();
         } catch (LockObtainFailedException e) {
             e.printStackTrace();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }

###############

package com.lucene.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * TokenFilter that injects synonyms into the token stream. Each synonym
 * is emitted at the same position as its original token (position
 * increment 0), so queries match either the original word or a synonym.
 */
public class MySameworkFilter extends TokenFilter {

    // Hard-coded synonym dictionary, built once. The original code
    // rebuilt this HashMap on every incrementToken() call.
    private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();
    static {
        SYNONYMS.put("我", new String[]{"咱", "俺"});
        SYNONYMS.put("中国", new String[]{"大陆", "天朝"});
    }

    // Term text of the current token.
    private final CharTermAttribute cta;
    // Position increment between tokens (0 marks a synonym).
    private final PositionIncrementAttribute pia;
    // Captured attribute state of the token whose synonyms are pending.
    private AttributeSource.State current = null;
    // Synonyms queued for emission before advancing the input stream.
    private final Stack<String> sames;

    protected MySameworkFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
    }

    /**
     * Emits queued synonyms first; otherwise advances the wrapped stream
     * and, if the new token has synonyms, captures its state so those
     * synonyms inherit the token's attributes on later calls.
     *
     * @return true while a token (original or synonym) is available
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (sames.size() > 0) {
            // Pop one queued synonym and restore the original token's
            // attribute state before overwriting the term text.
            String str = sames.pop();
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // Increment 0 places the synonym at the same position.
            pia.setPositionIncrement(0);
            return true;
        }

        if (!this.input.incrementToken()) {
            return false;
        }

        // If this term has synonyms, queue them and remember the token
        // state so each synonym can be restored onto it.
        if (this.getSamewords(cta.toString())) {
            current = captureState();
        }

        return true;
    }

    /**
     * Pushes all synonyms of {@code key} (if any) onto the stack.
     *
     * @param key the term text to look up
     * @return true if at least one synonym is now queued
     */
    private boolean getSamewords(String key) {
        String[] words = SYNONYMS.get(key);
        if (words != null) {
            for (String s : words) {
                sames.push(s);
            }
        }
        return sames.size() > 0;
    }

}


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值