lucene搜索

最新推荐文章于 2022-12-04 15:57:36 发布

小海龟的吻

最新推荐文章于 2022-12-04 15:57:36 发布

阅读量212

点赞数 1

分类专栏：学习笔记　文章标签：lucene、全文检索、高亮显示

本文链接：https://blog.csdn.net/qq_35618489/article/details/86984413

版权

学习笔记专栏收录该内容

12 篇文章 0 订阅

订阅专栏

Lucene是Apache软件基金会Jakarta项目组的一个子项目，是一个开放源代码的全文检索引擎工具包。它不是一个完整的全文检索引擎，而是一个全文检索引擎的架构，提供了完整的查询引擎和索引引擎，以及部分文本分析引擎（英文与德文两种西方语言）。

lucene的各版本下载地址：

http://mirror.bit.edu.cn/apache/lucene/java/

本文主要介绍lucene的简单使用。

luke的各个版本下载地址：https://github.com/DmitryKey/luke/releases?after=luke-7.2.0

1.项目结构

2. pom.xml的依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.wjx</groupId>
    <artifactId>lucene</artifactId>
    <version>1.0-SNAPSHOT</version>


    <!-- Lucene version shared by every org.apache.lucene artifact below -->
    <properties>
        <lucene.version>6.6.5</lucene.version>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
        <!-- Lucene core indexing package -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
        <!-- Lucene query parsing -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>


        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-common -->
        <!-- Lucene analyzers: split a piece of text into terms -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
        <!-- Lucene highlighting of matched search terms -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <!-- Chinese analyzer: SmartChineseAnalyzer -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-smartcn</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-backward-codecs -->
        <!-- Author's note: with a newer Lucene version, adding this dependency avoids errors
             (presumably when opening an index written by an older version; confirm) -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-backward-codecs</artifactId>
            <version>${lucene.version}</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
        <!-- MySQL JDBC driver -->
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.47</version>
        </dependency>

        <!-- Lombok (the User class is annotated with lombok.Data) -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.18.4</version>
        </dependency>

    </dependencies>


    <build>
        <plugins>
            <!-- Compile with Java 7 source and target levels.
                 NOTE(review): Lucene 6.x is documented as requiring Java 8; confirm whether
                 these levels should be raised to 8. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>7</source>
                    <target>7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

2.创建索引

Indexer.java

package com.wjx.index;

import com.wjx.util.FileUtil;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;

/**
 * Builds a small demo Lucene index from three hard-coded city records.
 *
 * <p>Each record becomes one document with the fields {@code id} and {@code city}
 * (stored, not tokenized) and {@code des} (stored and analysed with
 * {@link SmartChineseAnalyzer}).
 *
 * @author wjx
 * @since 2019/2/12
 */
public class Indexer {

    /** Document ids; entry {@code i} belongs with {@code cities[i]} and {@code des[i]}. */
    private Integer[] ids = {1, 2, 3};

    /** City names, parallel to {@link #ids}. */
    private String[] cities = {"北京", "上海", "南京"};

    /** City descriptions, parallel to {@link #ids}. */
    private String[] des = {"北京是中国首都。", "上海市中国魔都。", "南京有玄武湖。"};

    /**
     * Creates an {@link IndexWriter} on the given directory that analyses text with the
     * Chinese analyzer.
     *
     * @param directory the directory the index is written to
     * @return a new writer; the caller is responsible for closing it
     * @throws IOException if the writer cannot be opened
     */
    private IndexWriter getWriter(Directory directory) throws IOException {
        // SmartChineseAnalyzer is used instead of StandardAnalyzer so Chinese text is segmented.
        IndexWriterConfig config = new IndexWriterConfig(new SmartChineseAnalyzer());
        return new IndexWriter(directory, config);
    }

    /**
     * Writes one document per entry of {@link #ids} into the index stored at {@code path}.
     *
     * <p>The writer is opened with the default configuration, so documents are added to
     * whatever index already exists at {@code path}; running this twice adds them twice.
     *
     * @param path file-system location of the index directory
     * @throws IOException if the index cannot be opened or written
     */
    private void createIndexer(String path) throws IOException {
        // try-with-resources closes the writer and then the directory, even when
        // addDocument throws part-way through the loop.
        try (Directory directory = FSDirectory.open(Paths.get(path));
             IndexWriter writer = getWriter(directory)) {
            for (int i = 0; i < ids.length; i++) {
                Document document = new Document();
                document.add(new StringField("id", String.valueOf(ids[i]), Field.Store.YES));
                document.add(new StringField("city", cities[i], Field.Store.YES));
                document.add(new TextField("des", des[i], Field.Store.YES));
                writer.addDocument(document);
            }
        }
    }

    /**
     * Builds the demo index at {@code FileUtil.LUCENE_PATH} and prints a confirmation.
     *
     * @param args unused
     * @throws IOException if the index cannot be written
     */
    public static void main(String[] args) throws IOException {
        new Indexer().createIndexer(FileUtil.LUCENE_PATH);
        System.out.println("创建索引完成");
    }
}

3.进行搜索

Searcher.java

package com.wjx.index;

import com.wjx.util.FileUtil;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.StringReader;
import java.nio.file.Paths;

/**
 * Searches the demo Lucene index and prints each hit with the matched terms highlighted.
 *
 * @author wjx
 * @since 2019/2/12
 */
public class Searcher {

    /** Maximum number of hits requested from the index. */
    private static final int MAX_HITS = 10;

    /** Name of the analysed text field that is searched and highlighted. */
    private static final String FIELD_DES = "des";

    /**
     * Parses {@code words} as a query against the {@code des} field, runs it, and prints
     * timing information followed by the best highlighted fragment of every hit.
     *
     * @param indexPath file-system location of the index directory
     * @param words     query text in Lucene classic query-parser syntax
     * @throws Exception if the index cannot be read, the query cannot be parsed,
     *                   or highlighting fails
     */
    private void search(String indexPath, String words) throws Exception {
        // The directory, reader and analyzer are all Closeable; release them even on failure.
        try (Directory directory = FSDirectory.open(Paths.get(indexPath));
             IndexReader indexReader = DirectoryReader.open(directory);
             SmartChineseAnalyzer chineseAnalyzer = new SmartChineseAnalyzer()) {

            IndexSearcher is = new IndexSearcher(indexReader);
            // The Chinese analyzer is used here as well, matching how Indexer analysed "des".
            QueryParser parser = new QueryParser(FIELD_DES, chineseAnalyzer);
            Query query = parser.parse(words);

            long start = System.currentTimeMillis();
            // Fetch at most MAX_HITS best-matching documents.
            TopDocs hits = is.search(query, MAX_HITS);
            long end = System.currentTimeMillis();
            System.out.println("匹配 " + words + " ，总共花费" + (end - start) + "毫秒" + "查询到" + hits.totalHits + "个记录");

            Highlighter highlighter = getHighlighter(query);

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                // scoreDoc.doc is the internal document number of the hit.
                Document document = is.doc(scoreDoc.doc);
                String des = document.get(FIELD_DES);
                if (des == null) {
                    continue;
                }
                // Re-analyse the stored text under the same field name it was indexed with
                // so the highlighter can locate the query terms.
                TokenStream tokenStream = chineseAnalyzer.tokenStream(FIELD_DES, new StringReader(des));
                // getBestFragment yields only the best-scoring fragment, and null when no
                // fragment contains a query term; fall back to the plain text in that case.
                String fragment = highlighter.getBestFragment(tokenStream, des);
                System.out.println(fragment != null ? fragment : des);
            }
        }
    }

    /**
     * Builds a highlighter that wraps matched terms of {@code query} in bold red HTML.
     * Change the two {@link SimpleHTMLFormatter} arguments to use a different prefix/suffix.
     *
     * @param query the query whose terms should be highlighted
     * @return a configured highlighter
     */
    private Highlighter getHighlighter(Query query) {
        // Scores text fragments by the query terms they contain.
        QueryScorer queryScorer = new QueryScorer(query);
        // Splits the text into fragments based on the scorer.
        Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
        // Markup placed before and after every highlighted term.
        SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
        Highlighter highlighter = new Highlighter(simpleHTMLFormatter, queryScorer);
        highlighter.setTextFragmenter(fragmenter);
        return highlighter;
    }


    /**
     * Runs a sample search for "中国" against the index at {@code FileUtil.LUCENE_PATH}.
     *
     * @param args unused
     * @throws Exception if the search fails
     */
    public static void main(String[] args) throws Exception {
        new Searcher().search(FileUtil.LUCENE_PATH, "中国");
    }
}

4.删除修改

删除索引

 /**
  * Deletes every document whose {@code id} term equals "1" from the index at
  * {@code FileUtil.LUCENE_PATH}.
  *
  * @param args unused
  * @throws Exception if the index cannot be opened or written
  */
 public static void main(String[] args) throws Exception {

        // try-with-resources closes the writer and the directory even if the delete fails.
        // StandardAnalyzer is only needed to build the writer config; a term delete does
        // not analyse anything.
        try (Directory directory = FSDirectory.open(Paths.get(FileUtil.LUCENE_PATH));
             IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
            // Marks the matching documents as deleted; they stop being returned by readers
            // opened after the commit below.
            writer.deleteDocuments(new Term("id", "1"));

            // Optional: merges away segments containing deletions so the deleted documents
            // are also physically removed from the index files (e.g. when inspecting with Luke).
            writer.forceMergeDeletes();
            writer.commit();
        }
    }

修改索引

 /**
  * Replaces the document whose {@code id} term equals "2" with a freshly built document
  * in the index at {@code FileUtil.LUCENE_PATH}.
  *
  * @param args unused
  * @throws Exception if the index cannot be opened or written
  */
 public static void main(String[] args) throws Exception {

        // try-with-resources closes the writer and the directory even if the update fails.
        try (Directory directory = FSDirectory.open(Paths.get(FileUtil.LUCENE_PATH));
             IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
            // NOTE(review): this also deletes document id=1, which is unrelated to the update
            // of id=2 below; it looks like a leftover from the delete example. Confirm intent.
            writer.deleteDocuments(new Term("id", "1"));

            Document document = new Document();
            document.add(new TextField("id", "2", Field.Store.YES));
            document.add(new TextField("name", "my name is lzy", Field.Store.YES));
            document.add(new TextField("age", "26", Field.Store.YES));
            // StringField is indexed as a single term and is not split into tokens.
            document.add(new StringField("description", "my name is lzy", Field.Store.YES));

            // updateDocument deletes all documents containing the term, then adds the new one.
            writer.updateDocument(new Term("id", "2"), document);
            writer.commit();
        }
    }

4.一些辅助类

用户实体类 User.java

package com.wjx.pojo;

import lombok.Data;

/**
 * Plain data holder for one user record with an id, a name, an age and a description.
 * Accessors and the other members provided by Lombok's {@code @Data} are generated
 * at compile time.
 *
 * @author wjx
 * @since 2019/2/11
 */
@Data
public class User {

    /** Numeric identifier of the user. */
    private int id;

    /** Name of the user. */
    private String name;

    /** Age of the user. */
    private int age;

    /** Free-text description of the user. */
    private String description;

    /**
     * Creates a fully populated user.
     *
     * @param id          numeric identifier
     * @param name        name of the user
     * @param age         age of the user
     * @param description free-text description
     */
    public User(int id, String name, int age, String description) {
        this.id = id;
        this.name = name;
        this.age = age;
        this.description = description;
    }

    /** Creates a user with all fields left at their Java default values. */
    public User() {
    }
}

文件操作辅助类 FileUtil.java

package com.wjx.util;

/**
 * Central place for the file-system locations used by the Lucene examples.
 *
 * @author wjx
 * @since 2019/2/12
 */
public class FileUtil {

    /** Data directory path; not referenced by the examples shown here (purpose presumed from its name). */
    public static final String SEARCH_FILE_PATH = "D:\\Program Files\\lucene\\data";

    /** Directory in which the examples create, search and modify the Lucene index. */
    public static final String LUCENE_PATH = "d:\\lucene";

}

数据库辅助类 DBUtil.java

package com.wjx.util;


import com.wjx.pojo.User;

import java.sql.*;
import java.util.ArrayList;
import java.util.List;

/**
 * JDBC helper that loads all rows of the {@code user} table as {@link User} objects.
 *
 * @author wjx
 * @since 2019/2/11
 */
public class DBUtil {

    // NOTE(review): connection settings and credentials are hard-coded in source.
    // Move them to external configuration before using this outside a demo.
    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://114.116.24.32:3306/wjx?useUnicode=true&characterEncoding=UTF-8&useSSL=false";
    private static final String USER_NAME = "wjx";
    private static final String PASSWORD = "123";

    /**
     * Opens a new connection to the configured MySQL database.
     *
     * @return an open connection, never {@code null}; the caller must close it
     * @throws SQLException if the driver class is missing or the connection cannot be
     *                      established
     */
    private static Connection getConnection() throws SQLException {
        try {
            // Make sure the driver class is loaded before asking DriverManager for a connection.
            Class.forName(DRIVER);
        } catch (ClassNotFoundException e) {
            throw new SQLException("JDBC driver not found: " + DRIVER, e);
        }
        return DriverManager.getConnection(URL, USER_NAME, PASSWORD);
    }

    /**
     * Reads every row of the {@code user} table.
     *
     * <p>This is best-effort: on a database error the stack trace is printed and the rows
     * read so far (possibly none) are returned.
     *
     * @return the users read from the database, never {@code null}
     */
    public static List<User> getLuceneAll() {

        List<User> userList = new ArrayList<>();
        String sql = "select * from user";

        // try-with-resources closes the result set, statement and connection in reverse
        // order of creation, each independently of the others, even on failure.
        try (Connection connection = getConnection();
             PreparedStatement pstm = connection.prepareStatement(sql);
             ResultSet resultSet = pstm.executeQuery()) {
            while (resultSet.next()) {
                // Columns 1-3 are read by position and therefore depend on the table's
                // column order; only "description" is read by name.
                int id = resultSet.getInt(1);
                String name = resultSet.getString(2);
                int age = resultSet.getInt(3);
                String description = resultSet.getString("description");
                userList.add(new User(id, name, age, description));
            }
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return userList;
    }

    /**
     * Prints all users read from the database.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<User> luceneAll = getLuceneAll();
        System.out.println(luceneAll);
    }
}

5.把数据库数据提取至索引

package com.wjx.lucene;


import com.wjx.pojo.User;
import com.wjx.util.DBUtil;
import com.wjx.util.FileUtil;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

/**
 * Indexes the users returned by {@code DBUtil.getLuceneAll()} into the Lucene index at
 * {@code FileUtil.LUCENE_PATH}.
 *
 * @author wjx
 * @since 2019/2/11
 */
public class Mysql_Writer {


    /**
     * Loads all users from the database and adds one Lucene document per user.
     *
     * @param args unused
     * @throws IOException if the index cannot be opened or written
     */
    public static void main(String[] args) throws IOException {

        // try-with-resources closes the writer and then the directory even when adding
        // a document fails part-way through.
        // StandardAnalyzer is used here; swap in a Chinese analyzer for Chinese text.
        try (Directory directory = FSDirectory.open(Paths.get(FileUtil.LUCENE_PATH));
             IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {

            List<User> userList = DBUtil.getLuceneAll();

            for (User user : userList) {
                Document document = new Document();
                document.add(new TextField("id", String.valueOf(user.getId()), Field.Store.YES));
                // Lucene field constructors reject null values, and these two columns come
                // straight from the database, so skip them when absent.
                if (user.getName() != null) {
                    document.add(new TextField("name", user.getName(), Field.Store.YES));
                }
                document.add(new TextField("age", String.valueOf(user.getAge()), Field.Store.YES));
                if (user.getDescription() != null) {
                    // StringField is indexed as a single term and is not split into tokens.
                    document.add(new StringField("description", user.getDescription(), Field.Store.YES));
                }
                writer.addDocument(document);
            }
        }
    }
}