Lucene 5.5 自定义分词器与过滤器需要注意的问题

最新推荐文章于 2020-07-27 20:50:52 发布

ysm9099

最新推荐文章于 2020-07-27 20:50:52 发布

阅读量781

点赞数

分类专栏： java lucene5.5 文章标签： lucene

本文链接：https://blog.csdn.net/ysm9099/article/details/51350887

版权

java lucene5.5 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

/**
 * Token filter that emits synonyms for selected terms.
 *
 * <p>Each token from the wrapped stream is passed through unchanged. If the
 * token's text has entries in the built-in synonym dictionary, the following
 * calls to {@link #incrementToken()} emit those synonyms one by one with a
 * position increment of 0, i.e. stacked on the position of the original token.
 *
 * <p>The filter keeps per-stream state (the pending synonyms and the captured
 * attribute state), so {@link #reset()} clears it before the filter is reused.
 */
public final class MySameFilter extends TokenFilter {
    /** Synonym dictionary (term -> synonyms), built once instead of once per token. */
    private static final Map<String, String[]> SAME_WORDS = new HashMap<String, String[]>();
    static {
        SAME_WORDS.put("我", new String[]{"咱","俺"});
        SAME_WORDS.put("中国", new String[]{"大陆","天朝"});
    }

    /** Text of the current token. */
    private final CharTermAttribute cta;
    /** Position increment between the current token and the previous one. */
    private final PositionIncrementAttribute pia;
    /** Attribute state captured for the most recent token that has synonyms. */
    private AttributeSource.State current;
    /** Synonyms still to be emitted for the most recent token. */
    private final Stack<String> sames = new Stack<String>();

    /**
     * Creates the filter on top of {@code input}.
     *
     * @param input the token stream to read tokens from
     */
    protected MySameFilter(TokenStream input) {
        super(input);
        this.cta = addAttribute(CharTermAttribute.class);
        this.pia = addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token: a pending synonym if there is one, otherwise
     * the next token of the wrapped stream.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!sames.isEmpty()) {
            // Pop the next pending synonym.
            String str = sames.pop();
            // Start from the attribute state captured for the original token,
            // then overwrite only the term text.
            restoreState(current);
            cta.setEmpty();
            cta.append(str);
            // Increment 0 puts the synonym on the same position as the original.
            pia.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        // If this term has synonyms they are now queued; remember the token's
        // state so it can be restored when the synonyms are emitted.
        if (getSamewords(cta.toString())) {
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and drops any state left over from a previous
     * use, so that synonyms queued for an earlier (possibly only partially
     * consumed) stream are never emitted into the next one.
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        current = null;
    }

    /**
     * Queues the synonyms of {@code key}, if any, onto the pending stack.
     *
     * @param key the term text to look up
     * @return {@code true} if the pending stack is non-empty after the lookup
     */
    private boolean getSamewords(String key) {
        String[] words = SAME_WORDS.get(key);
        if (words != null) {
            // Pushed in dictionary order, so they are popped (emitted) in reverse.
            for (String s : words) {
                sames.push(s);
            }
        }
        return !sames.isEmpty();
    }
}

public class MySameAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String s) {
        //Tokenizer tokenizer = new StandardTokenizer();
        Dictionary dictionary = Dictionary.getInstance();
        MaxWordSeg mws = new MaxWordSeg(dictionary);
        MMSegTokenizer mt = new MMSegTokenizer(mws);
        MySameFilter msf = new MySameFilter(mt);
        return new TokenStreamComponents(mt, msf);
        //很多博客上面都是写的如下
       //return new TokenStreamComponents(tokenizer, msf);
     //这样写是不对的，本人试验过如果返回的里面传入tokenizer，会出现空指针异常，可能是因为5.5版本里面的mmseg分词器加了reset方法，
      而tokenstream也要调用reset方法导致。所以注意一定不要传tokenizer！

 }}