Data matching with Lucene 7.4

1. Maven dependencies

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>7.4.0</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>7.4.0</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>7.4.0</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>com.janeluo</groupId>
            <artifactId>ikanalyzer</artifactId>
            <version>2012_u6</version>
        </dependency>

2. Rewriting IK for compatibility with Lucene 7.4

The bundled IKAnalyzer (2012_u6) targets older Lucene APIs: since Lucene 5.x, Analyzer.createComponents takes only the field name and Tokenizer constructors no longer accept a Reader, so both the analyzer and the tokenizer are re-implemented below.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;

import java.util.List;

public final class IKAnalyzerExt extends Analyzer {
    //synonym dictionary
    private SynonymMap synonymMap;
    //stop-word dictionary
    private List<String> stopWords;
    //finest-grained segmentation by default; set to true for coarse-grained (smart) segmentation
    private boolean useSmart;

    public boolean useSmart() {
        return this.useSmart;
    }

    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    public IKAnalyzerExt() {
        //finest-grained segmentation by default; pass true for coarse-grained
        this(false);
    }

    public IKAnalyzerExt(boolean useSmart) {
        this.useSmart = useSmart;
    }

    public IKAnalyzerExt(SynonymMap synonymMap, List<String> stopWords, boolean useSmart) {
        //finest-grained segmentation by default; pass true for coarse-grained
        this(useSmart);
        this.synonymMap = synonymMap;
        this.stopWords = stopWords;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer _IKTokenizerExt = new IKTokenizerExt(this.useSmart());
        TokenStream tokenStream;

        //attach the synonym graph filter when a synonym map was supplied
        if (synonymMap != null && synonymMap.fst != null && synonymMap.maxHorizontalContext > 0) {
            tokenStream = new SynonymGraphFilter(_IKTokenizerExt, synonymMap, true);
        } else {
            tokenStream = _IKTokenizerExt;
        }
        //attach the stop-word filter when a stop-word list was supplied
        if (stopWords != null && !stopWords.isEmpty()) {
            tokenStream = new StopFilter(tokenStream, StopFilter.makeStopSet(stopWords));
        }
        return new TokenStreamComponents(_IKTokenizerExt, tokenStream);
    }
}

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;

public final class IKTokenizerExt extends Tokenizer {
    //underlying IK segmenter
    private IKSegmenter _IKImplement;
    private final CharTermAttribute termAtt = this.addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = this.addAttribute(OffsetAttribute.class);
    private final TypeAttribute typeAtt = this.addAttribute(TypeAttribute.class);
    private int endPosition;

    public IKTokenizerExt(boolean useSmart) {
        this._IKImplement = new IKSegmenter(this.input, useSmart);
    }

    @Override
    public boolean incrementToken() throws IOException {
        this.clearAttributes();
        Lexeme nextLexeme = this._IKImplement.next();
        if (nextLexeme != null) {
            this.termAtt.append(nextLexeme.getLexemeText());
            this.termAtt.setLength(nextLexeme.getLength());
            this.offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            this.endPosition = nextLexeme.getEndPosition();
            this.typeAtt.setType(nextLexeme.getLexemeTypeString());
            return true;
        } else {
            return false;
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        this._IKImplement.reset(this.input);
    }

    @Override
    public final void end() throws IOException {
        super.end();
        int finalOffset = this.correctOffset(this.endPosition);
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
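
A quick way to verify the rewritten classes is to tokenize a sample string directly. The snippet below is a minimal sketch: the demo class name, the input text, and the empty field name are arbitrary, and the analyzer is built with finest-grained segmentation and no synonym or stop-word dictionaries.

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IKAnalyzerExtDemo {
    public static void main(String[] args) throws IOException {
        //finest-grained segmentation, no synonyms or stop words
        try (IKAnalyzerExt analyzer = new IKAnalyzerExt(false);
             TokenStream ts = analyzer.tokenStream("", "用户名称 user name")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                //print each token produced by IK
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}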

3. Building the matching engine

import com.lawtrust.classification.web.modules.task.lucene.ik.IKAnalyzerExt;
import lombok.extern.log4j.Log4j2;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CharsRef;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;

/**
 * @DESCRIPTION: Lucene matching engine
 * @USER: zenglingceng
 * @DATE: 2023/5/4 13:36
 */
@Log4j2
@Component
public class LuceneEngine {

    /**
     * Build an in-memory Lucene index from the feature data and return a searcher.
     * analyzer: the analyzer used at index time
     * features: feature records to index
     */
    public IndexSearcher getIndexSearcher(Analyzer analyzer, List<BaseFieldEntity> features) throws IOException {

        //build the in-memory Lucene index
        Directory index = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        IndexWriter writer = new IndexWriter(index, config);
        /*Field types in brief:
        TextField
        tokenized, indexed, storage optional.
        StringField
        not tokenized, indexed, storage optional.
        IntPoint, LongPoint, FloatPoint, DoublePoint
        indexed for point/range queries, not stored (add a separate StoredField if storage is needed).
        StoredField
        not tokenized, not indexed, stored only.
        */
        //fixme the feature data used to build the Lucene documents is fetched from the database

        features.stream().forEach(x -> {
            //field/classification name
            String key = x.getFieldClassificationName();
            String description = x.getDescription();
            if (key != null) {
                //tokenized version
                Document doc = new Document();
                doc.add(new TextField("key", key.toLowerCase(Locale.ROOT), Field.Store.YES));
                doc.add(new StringField("value", x.getId() + "", Field.Store.YES));
                //untokenized version
                Document doc1 = new Document();
                doc1.add(new StringField("key", key.toLowerCase(Locale.ROOT), Field.Store.YES));
                doc1.add(new StringField("value", x.getId() + "", Field.Store.YES));
                try {
                    writer.addDocument(doc);
                    writer.addDocument(doc1);
                } catch (Exception e) {
                    log.error("failed to build Lucene document from feature data {}", x);
                }
            }
            //description text
            if (description != null) {
                //tokenized version
                Document doc = new Document();
                doc.add(new TextField("key", description.toLowerCase(Locale.ROOT), Field.Store.YES));
                doc.add(new StringField("value", x.getId() + "", Field.Store.YES));
                //untokenized version
                Document doc1 = new Document();
                doc1.add(new StringField("key", description.toLowerCase(Locale.ROOT), Field.Store.YES));
                doc1.add(new StringField("value", x.getId() + "", Field.Store.YES));
                try {
                    writer.addDocument(doc);
                    writer.addDocument(doc1);
                } catch (Exception e) {
                    log.error("failed to build Lucene document from feature data {}", x);
                }
            }
        });
        writer.close();
        IndexReader reader = DirectoryReader.open(index);
        return new IndexSearcher(reader);
    }

    /**
     * Build the analyzer.
     * thesaurusList: dictionary entries (synonyms and stop words)
     * useSmart: true for coarse-grained (smart) segmentation, false for finest-grained
     */
    public IKAnalyzerExt getIkAnalyzerExt(List<BaseThesaurusEntity> thesaurusList, boolean useSmart) throws IOException {

        //synonym map
        SynonymMap synonymMap = null;
        //stop-word list
        List<String> stopWords = null;
        //fixme the dictionary entries are fetched from the database
        if (thesaurusList != null && thesaurusList.size() > 0) {
            //group by wordType
            Map<Integer, List<BaseThesaurusEntity>> collect = thesaurusList.stream().collect(Collectors.groupingBy(x -> x.getWordType()));
            //synonyms
            List<BaseThesaurusEntity> synonyms = collect.get(0);
            //stop words
            List<BaseThesaurusEntity> stops = collect.get(1);
            if (synonyms != null && synonyms.size() > 0) {
                SynonymMap.Builder builder = new SynonymMap.Builder(true);
                synonyms.stream().forEach(x -> {
                    String word = x.getWord();
                    String relWord = x.getRelaWord();
                    if (word != null && relWord != null) {
                        String[] split = relWord.split("\\,");
                        for (String synonym : split) {
                            builder.add(new CharsRef(word), new CharsRef(synonym), true);
                        }
                    }
                });
                synonymMap = builder.build();
            }

            if (stops != null && stops.size() > 0) {
                stopWords = new ArrayList<>();
                List<String> finalStopWords = stopWords;
                stops.stream().forEach(x -> {
                    String relWord = x.getRelaWord();
                    if (relWord != null) {
                        String[] split = relWord.split("\\,");
                        for (String stop : split) {
                            finalStopWords.add(stop);
                        }
                    }
                });
            }
        }

        IKAnalyzerExt iKAnalyzerExt = new IKAnalyzerExt(synonymMap, stopWords, useSmart);
        return iKAnalyzerExt;
    }

    public static void main(String[] args) throws IOException, ParseException {
//        LuceneEngine match = new LuceneEngine();
//        IKAnalyzerExt ikAnalyzerExt = match.getIkAnalyzerExt(null,false);
//        String fengci="User_Name";
//        String _fieldName = fengci.replaceAll("_", "");
//        StringReader readerTest = new StringReader(_fieldName);
//        TokenStream tokenStream = ikAnalyzerExt.tokenStream("", readerTest);
//        tokenStream.reset();
//        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
//        while (tokenStream.incrementToken()) {
//            System.out.println(termAttribute.toString());
//        }
//        tokenStream.close();
//        String __fieldName = fengci.replaceAll("_", " ");
//        StringReader readerTest1 = new StringReader(__fieldName);
//        TokenStream tokenStream1 = ikAnalyzerExt.tokenStream("", readerTest1);
//        tokenStream1.reset();
//        CharTermAttribute termAttribute1 = tokenStream1.addAttribute(CharTermAttribute.class);
//        while (tokenStream1.incrementToken()) {
//            System.out.println(termAttribute1.toString());
//        }

//        IndexSearcher searcher = match.getIndexSearcher(ikAnalyzerExt);
//        String query = "w";
//        Query parse = new QueryParser("key", ikAnalyzerExt).parse(query);
//        Query term = new TermQuery(new Term("key", query));
//        Query matchall = new MatchAllDocsQuery();
//        TopDocs docs = searcher.search(parse, 3);
//        System.out.println(docs.totalHits + " matching documents found.");
//        for (ScoreDoc scoreDoc : docs.scoreDocs) {
//            Document doq = searcher.doc(scoreDoc.doc);
//            System.out.println(doq.get("key") + " (score: " + scoreDoc.score + ")");
//            System.out.println(doq.get("value") + " (score: " + scoreDoc.score + ")");
//        }

    }
}
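
For completeness, here is a rough end-to-end sketch of how the engine can be used. loadFeatures() is a hypothetical placeholder for the project's real data-access call and the query string is arbitrary; only the Lucene and engine calls mirror the class above.

import java.io.IOException;
import java.util.List;

import org.apache.lucene.document.Document;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

public class LuceneEngineDemo {

    //hypothetical helper: in the real project the feature list comes from the database
    static List<BaseFieldEntity> loadFeatures() {
        throw new UnsupportedOperationException("replace with the real data-access call");
    }

    public static void main(String[] args) throws IOException, ParseException {
        LuceneEngine engine = new LuceneEngine();
        //no dictionary entries, finest-grained segmentation
        IKAnalyzerExt analyzer = engine.getIkAnalyzerExt(null, false);
        IndexSearcher searcher = engine.getIndexSearcher(analyzer, loadFeatures());

        //match a candidate field name against the indexed feature names and descriptions
        Query query = new QueryParser("key", analyzer).parse("用户名称");
        TopDocs docs = searcher.search(query, 10);
        System.out.println(docs.totalHits + " matching documents found.");
        for (ScoreDoc scoreDoc : docs.scoreDocs) {
            Document hit = searcher.doc(scoreDoc.doc);
            System.out.println(hit.get("value") + " -> " + hit.get("key") + " (score: " + scoreDoc.score + ")");
        }
    }
}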
