Lucene笔记19-Lucene的分词-实现自定义同义词分词器-实现分词器

最新推荐文章于 2024-04-09 09:33:49 发布

王劭阳

最新推荐文章于 2024-04-09 09:33:49 发布

阅读量348

点赞数

分类专栏： Lucene

本文链接：https://blog.csdn.net/qq_36059561/article/details/83448421

版权

Lucene 专栏收录该内容

50 篇文章 1 订阅

订阅专栏

一、同义词分词器的代码实现

package com.wsy;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

/**
 * Analyzer that segments text with mmseg4j's max-word segmentation and then
 * injects synonyms through {@link MySameTokenFilter}.
 *
 * <p>Declared {@code final} because Lucene 3.x requires Analyzer implementations
 * (or their {@code tokenStream}/{@code reusableTokenStream} methods) to be final;
 * the requirement is enforced by an assertion when the JVM runs with {@code -ea}.
 */
public final class MySameAnalyzer extends Analyzer {
    /**
     * Builds the analysis chain: MMSeg tokenizer wrapped by the synonym filter.
     *
     * @param string field name (not used by this implementation)
     * @param reader source of the text to analyze
     * @return a token stream that emits the segmented words plus their synonyms
     */
    @Override
    public TokenStream tokenStream(String string, Reader reader) {
        // Directory that holds the segmentation dictionary.
        Dictionary dictionary = Dictionary.getInstance("E:\\Lucene\\mmseg4j-1.8.5\\data");
        return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dictionary), reader));
    }

    /**
     * Prints every token the analyzer produces for the given text, one per line, as
     * {@code positionIncrement:term[startOffset-endOffset]-->type}, followed by a
     * separator line. I/O errors are reported on stderr and not rethrown.
     *
     * @param string   text to analyze
     * @param analyzer analyzer used to produce the token stream
     */
    public static void displayAllToken(String string, Analyzer analyzer) {
        TokenStream tokenStream = null;
        try {
            tokenStream = analyzer.tokenStream("content", new StringReader(string));
            // Register the attributes we want to inspect on the stream.
            // Position increment: distance between this token and the previous one.
            PositionIncrementAttribute positionIncrementAttribute = tokenStream.addAttribute(PositionIncrementAttribute.class);
            // Start/end character offsets of each token.
            OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
            // Text of each token.
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            // Type label assigned to the token by the tokenizer.
            TypeAttribute typeAttribute = tokenStream.addAttribute(TypeAttribute.class);
            while (tokenStream.incrementToken()) {
                System.out.println(positionIncrementAttribute.getPositionIncrement() + ":" + charTermAttribute + "[" + offsetAttribute.startOffset() + "-" + offsetAttribute.endOffset() + "]-->" + typeAttribute.type());
            }
            System.out.println("----------------------------");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release the stream (and the reader it wraps) even if iteration failed.
            if (tokenStream != null) {
                try {
                    tokenStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Demo: indexes one sentence in memory, searches it by a synonym ("天朝") that
     * does not occur in the original text, prints the stored content of the first
     * hit, and finally dumps the token stream of the sentence.
     *
     * @param args unused
     * @throws IOException if indexing or searching fails
     */
    public static void main(String[] args) throws IOException {
        String string = "我来自中国山东聊城。";
        MySameAnalyzer analyzer = new MySameAnalyzer();
        Directory directory = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, analyzer));
        try {
            Document document = new Document();
            document.add(new Field("content", string, Field.Store.YES, Field.Index.ANALYZED));
            indexWriter.addDocument(document);
        } finally {
            // Always release the writer, even when adding the document fails.
            indexWriter.close();
        }
        IndexReader indexReader = IndexReader.open(directory);
        try {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            try {
                TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("content", "天朝")), 10);
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;
                if (scoreDocs.length > 0) {
                    Document hit = indexSearcher.doc(scoreDocs[0].doc);
                    System.out.println(hit.get("content"));
                }
            } finally {
                indexSearcher.close();
            }
        } finally {
            // The reader was opened here and handed to the searcher, so it is closed here as well.
            indexReader.close();
        }
        MySameAnalyzer.displayAllToken(string, analyzer);
    }
}

package com.wsy;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

/**
 * Token filter that emits, after every token that has known synonyms, one extra
 * token per synonym at the same position (position increment 0).
 *
 * <p>Declared {@code final} because Lucene 3.x requires TokenStream implementations
 * (or their {@code incrementToken()} method) to be final; the requirement is
 * enforced by an assertion when the JVM runs with {@code -ea}.
 */
public final class MySameTokenFilter extends TokenFilter {
    /** Synonym table: term -> its synonyms. Built once instead of once per token. */
    private static final Map<String, String[]> SAME_WORDS = new HashMap<String, String[]>();

    static {
        SAME_WORDS.put("中国", new String[]{"天朝", "大陆"});
        SAME_WORDS.put("我", new String[]{"俺", "咱"});
    }

    private final CharTermAttribute charTermAttribute;
    private final PositionIncrementAttribute positionIncrementAttribute;
    /** Synonyms still to be emitted for the most recently read token (LIFO order). */
    private final Stack<String> stack = new Stack<String>();
    /** Attribute snapshot of the token whose synonyms are currently being emitted. */
    private State state;

    /**
     * @param input upstream token stream whose tokens are expanded with synonyms
     */
    protected MySameTokenFilter(TokenStream input) {
        super(input);
        charTermAttribute = this.addAttribute(CharTermAttribute.class);
        positionIncrementAttribute = this.addAttribute(PositionIncrementAttribute.class);
    }

    /**
     * Advances to the next token, much like {@code Iterator.hasNext()} combined with
     * {@code next()}: pending synonyms are emitted first; only when none are left is
     * the upstream stream asked for its next token.
     *
     * @return {@code true} if a token (original or synonym) is available,
     *         {@code false} once the upstream stream is exhausted
     * @throws IOException if the upstream stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        // Synonyms are waiting: emit one of them before reading further input.
        if (!stack.isEmpty()) {
            String sameWord = stack.pop();
            // Start from a copy of the original token's attributes ...
            restoreState(state);
            // ... then replace only the term text with the synonym.
            charTermAttribute.setEmpty();
            charTermAttribute.append(sameWord);
            // Increment 0 places the synonym at the same position as the original token.
            positionIncrementAttribute.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (getSameWords(charTermAttribute.toString())) {
            // The token has synonyms: snapshot its attributes for the synonym tokens.
            state = captureState();
        }
        return true;
    }

    /**
     * Resets the upstream stream and drops any synonyms or captured state left over
     * from a previous pass, so a reset stream never starts with stale tokens.
     *
     * @throws IOException if resetting the upstream stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        stack.clear();
        state = null;
    }

    /**
     * Looks up the synonyms of {@code key} and, if there are any, pushes them onto
     * the pending stack.
     *
     * @param key term text to look up
     * @return {@code true} if synonyms were found and queued, {@code false} otherwise
     */
    private boolean getSameWords(String key) {
        String[] sameWords = SAME_WORDS.get(key);
        if (sameWords == null) {
            return false;
        }
        for (String sameWord : sameWords) {
            stack.push(sameWord);
        }
        return true;
    }
}

参与搜索的字符串是“我来自中国山东聊城。”，我们使用了同义词分词器处理这个字符串，将“我”和“中国”做了同义词处理。当搜索“中国”的同义词“天朝”的时候，我们发现，同样可以把结果搜索出来，但是原字符串里面并没有“天朝”，这就是同义词分词器的作用效果了。

二、代码详细解释

单纯的看上面的代码，去理解这个过程可能有些吃力，那么下面，我就带着大家一步一步的来分析下，上面这段程序是怎么执行的，更好的理解一下同义词分词器的运行原理和实现原理，这里主要看一下MySameTokenFilter类中的incrementToken()方法，理解了这个方法，就知道了怎么实现的同义词的添加了。

既然说到同义词，那么就要用一个map去存储，map的key是一个词语，map的value是这个词语的所有同义词，所以泛型写法是<String, String[]>，这里用到一个数据结构：栈。为什么要用到栈呢？因为我们在处理同义词的时候，当把一个同义词添加到tokenStream中之后，这个同义词就用不到了，于是就需要从集合中移除，一是为了处理剩下的同义词，二是为了防止干扰，自然而然就想到了需要使用栈来解决这个问题。这里我们先模拟创建一个map，自己定义两个同义词，即“中国”和{“天朝”与“大陆”}，“我”和{“俺”与“咱”}。其中getSameWords()方法就是根据key来获取同义词，如果获取到了同义词，就将同义词放到栈中备用，并返回true表示有同义词需要处理，否则返回false即可。

如果待分词的字符串是“我来自中国山东聊城”，不做任何处理的情况下，会被分成{"我","来自","中国","山东","聊","城"}，下面来看incrementToken()方法。

第一次调用incrementToken()处理分词“我”的时候，stack是空的（因为还没有执行到后面的getSameWords()方法），所以第一个if没有进去；因为后面还有元素，input.incrementToken()返回true，第二个if也没有进去；第三个if进去了，因为getSameWords("我")返回true，将“俺”与“咱”压入了栈中，并通过captureState()保存了当前分词“我”的状态。之所以要保存状态，是因为后面输出同义词时会改写当前token的内容，继续往后读取“来自”时“我”的状态也会被覆盖，所以对于需要做同义词的词语，要先保存一份状态，最后方法返回true。incrementToken()方法的返回值有点像Iterator.hasNext()：如果有下一个元素，返回true；没有下一个元素，返回false。因为还有元素，incrementToken()方法会再次执行，这次第一个if进去了，因为此时栈中有两个元素：弹出栈顶元素“咱”，先用restoreState()还原回“我”的状态，再将charTermAttribute清空并赋值为同义词，然后设置和前一个元素的位置增量是0，也就意味着“咱”和“我”在同一个位置，此时方法返回true。继续执行，第一个if又进去了，因为还有“俺”在栈中，同样的操作，方法返回true。

当前遍历的元素是“来自”，查看栈中已经是空的，第一个if没进去，第二个if没进去，第三个if没进去，方法体返回true。

当前遍历的元素是“中国”，类比“我”的情景来思考一下即可。

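我突发奇想：当我把待分词的字符串改为“我来自中国”后，“中国”后面没有东西了，还会处理“中国”的同义词吗？运行之后，发现“中国”的同义词也做了处理。单步调试看看为什么：当前元素走到“中国”的时候，input.incrementToken()返回的是true而不是false，“中国”的同义词被压入栈中；之后每次调用incrementToken()都会先检查栈，只要栈不为空，就直接输出同义词并返回true，这时并不会去调用input.incrementToken()。直到栈被清空以后，才会再次调用input.incrementToken()，此时已经没有更多的分词了（调试时看到当前元素是空串），才返回false。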

最后看一下运行结果吧。可以看出来“俺”和“咱”的positionIncrement的值是0，意味着它们和“我”在同一个位置的。

我来自中国
1:我[0-1]-->word
0:咱[0-1]-->word
0:俺[0-1]-->word
1:来自[1-3]-->word
1:中国[3-5]-->word
0:大陆[3-5]-->word
0:天朝[3-5]-->word

在getSameWords()方法中，还有待优化的地方，具体优化内容请看下一篇笔记。

王劭阳

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Lucene笔记19-Lucene的分词-实现自定义同义词分词器-实现分词器

一、同义词分词器的代码实现package com.wsy;import com.chenlb.mmseg4j.Dictionary;import com.chenlb.mmseg4j.MaxWordSeg;import com.chenlb.mmseg4j.analysis.MMSegTokenizer;import org.apache.lucene.analysis.Analy...
复制链接

扫一扫