Lucene自定义同义词分词器

package com.lucene.util;
 
 import java.io.Reader;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 
 import com.chenlb.mmseg4j.Dictionary;
 import com.chenlb.mmseg4j.MaxWordSeg;
 import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
 
 public class MySameworkAnalyzer extends Analyzer {
 
     @Override
     public TokenStream tokenStream(String str, Reader reader) {
         //获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器
         Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data");
         return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
     }
 

 }

#####################

@Test
     public void test05(){
         try {
             Analyzer a1=new MySameworkAnalyzer();
             String str="我来自中国,我的名字叫什么";
             AnalyzerUtil.displayToken(str, a1);
             Directory directory=new RAMDirectory();
             IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1));
             Document document=new Document();
             document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED));
             indexWriter.addDocument(document);
             indexWriter.close();
             IndexReader indexReader=IndexReader.open(directory);
             IndexSearcher searcher=new IndexSearcher(indexReader);
             TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10);
             ScoreDoc[] docs=tds.scoreDocs;
             Document doc=searcher.doc(docs[0].doc);
             System.out.println(doc.get("content"));
             searcher.close();
             indexReader.close();
         } catch (CorruptIndexException e) {
             e.printStackTrace();
         } catch (LockObtainFailedException e) {
             e.printStackTrace();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }

###############

package com.lucene.util;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

/**
 * TokenFilter that injects synonyms into the token stream. Each synonym
 * is emitted at the same position as its original token (position
 * increment 0), so queries match either the original word or a synonym.
 */
public class MySameworkFilter extends TokenFilter {

    // Hard-coded synonym dictionary, built once. The original code
    // rebuilt this HashMap on every incrementToken() call.
    private static final Map<String, String[]> SYNONYMS = new HashMap<String, String[]>();
    static {
        SYNONYMS.put("我", new String[]{"咱", "俺"});
        SYNONYMS.put("中国", new String[]{"大陆", "天朝"});
    }

    // Term text of the current token.
    private final CharTermAttribute cta;
    // Position increment between tokens (0 marks a synonym).
    private final PositionIncrementAttribute pia;
    // Captured attribute state of the token whose synonyms are pending.
    private AttributeSource.State current = null;
    // Synonyms queued for emission before advancing the input stream.
    private final Stack<String> sames;

    protected MySameworkFilter(TokenStream input) {
        super(input);
        cta = this.addAttribute(CharTermAttribute.class);
        pia = this.addAttribute(PositionIncrementAttribute.class);
        sames = new Stack<String>();
    }

    /**
     * Emits queued synonyms first; otherwise advances the wrapped stream
     * and, if the new token has synonyms, captures its state so those
     * synonyms inherit the token's attributes on later calls.
     *
     * @return true while a token (original or synonym) is available
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (sames.size() > 0) {
            // Pop one queued synonym and restore the original token's
            // attribute state before overwriting the term text.
            String str = sames.pop();
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // Increment 0 places the synonym at the same position.
            pia.setPositionIncrement(0);
            return true;
        }

        if (!this.input.incrementToken()) {
            return false;
        }

        // If this term has synonyms, queue them and remember the token
        // state so each synonym can be restored onto it.
        if (this.getSamewords(cta.toString())) {
            current = captureState();
        }

        return true;
    }

    /**
     * Pushes all synonyms of {@code key} (if any) onto the stack.
     *
     * @param key the term text to look up
     * @return true if at least one synonym is now queued
     */
    private boolean getSamewords(String key) {
        String[] words = SYNONYMS.get(key);
        if (words != null) {
            for (String s : words) {
                sames.push(s);
            }
        }
        return sames.size() > 0;
    }

}


  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值