Lucene学习笔记

Lucene是Apache软件基金会Jakarta项目组的一个子项目,是一个开放源代码的全文检索引擎工具包,但它不是一个完整的全文检索引擎,而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,以及部分文本分析引擎(英文与德文两种西方语言)。Lucene的目的是为软件开发人员提供一个简单易用的工具包,以方便地在目标系统中实现全文检索的功能,或者是以此为基础建立起完整的全文检索引擎。Lucene是一套用于全文检索和搜寻的开源程式库,由Apache软件基金会支持和提供。Lucene提供了一个简单却强大的应用程式接口,能够做全文索引和搜寻。在Java开发环境里Lucene是一个成熟的免费开源工具。就其本身而言,Lucene是当前以及最近几年最受欢迎的免费Java信息检索程序库。人们经常提到信息检索程序库,它虽然与搜索引擎有关,但不应该将信息检索程序库与搜索引擎相混淆。

 

HelloWorld测试

创建Maven工程,打包方式选择jar

在pom.xml配置文件中导入lucene的对应jar包

<dependencies>

    <!-- 核心包lucene-core -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-core</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 查询解析lucene-queryparser -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-queryparser</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 解析器lucene-analyzers-common -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-analyzers-common</artifactId>

        <version>5.3.1</version>

    </dependency>

</dependencies>

 

Indexer.java用于创建全文检索的索引

import java.io.File;
import java.io.FileReader;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

 

/**

 *

 * @ClassName: Indexer

 * @Description: TODO (写索引)

 * @author A我去

 * @date 20191024日下午9:26:46

 */

public class Indexer {

   

    private IndexWriter writer; //写索引实例

   

    /**

     * 构造方法,实例化IndexWriter

     * @param indexDir

     * @throws Exception

     */

    public Indexer(String indexDir)throws Exception{

        Directory directory = FSDirectory.open(Paths.get(indexDir));

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器,只对英文有效

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        writer = new IndexWriter(directory, iwc);

    }

   

    /**

     *

     * <b>Description</b><br>

     * (关闭写索引)

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:35:48</b>

     */

    public void close()throws Exception{

        writer.close();

    }

   

    /**

     *

     * <b>Description</b><br>

     * (索引指定目录下的所有文件)

     * d:\lucene\data\**

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:37:16</b>

     */

    public int index(String dataDir)throws Exception{

        File[] files = new File(dataDir).listFiles(); //遍历目录下的所有文件

        for(File file : files) {

            indexFile(file);

        }

        //返回索引了多少个文件

        return writer.numRamDocs();

       

    }

 

    /**

     *

     * <b>Description</b><br>

     * (索引指定文件)

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:41:02</b>

     */

    private void indexFile(File file)throws Exception{

        System.out.println("索引文件:"+file.getCanonicalPath());

        Document document = getDocument(file);

        writer.addDocument(document); //添加索引

    }

 

    /**

     *

     * <b>Description</b><br>

     * (获取文档,文档里再设置每个字段)

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:43:32</b>

     */

    private Document getDocument(File file)throws Exception{

        Document document = new Document();

        //内容,不保存内容到索引

        document.add(new TextField("contents", new FileReader(file)));

        //保存文件名到索引

        document.add(new TextField("fileName", file.getName(), Field.Store.YES));

        //保存完整路径到索引

        document.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES));

        return document;

    }

   

    public static void main(String[] args){

        String indexDir = "D:\\lucene"; //索引的输出位置

        String dataDir = "D:\\lucene\\data"; //数据源

        Indexer indexer = null;

        int numIndexed = 0;

        long start = System.currentTimeMillis();

        try {

            indexer = new Indexer(indexDir);

            numIndexed = indexer.index(dataDir);

        } catch (Exception e) {

            e.printStackTrace();

        } finally {

            try {

                indexer.close();

            } catch (Exception e) {

                e.printStackTrace();

            }

        }

        long end = System.currentTimeMillis();

        System.out.println("索引了"+numIndexed+" 个文件,使用了"+(end-start)+" 毫秒");

    }

   

 

}

在D:\lucene\data目录中拷入若干用于测试的文本文件(索引程序只处理该目录下的文件)

 

Searcher.java用于查询全文检索

import java.nio.file.Paths;

 

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

 

public class Searcher {

 

    public static void search(String indexDir, String q)throws Exception{

        Directory directory = FSDirectory.open(Paths.get(indexDir));

        IndexReader reader = DirectoryReader.open(directory);

        IndexSearcher is = new IndexSearcher(reader);

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        QueryParser parser = new QueryParser("contents", analyzer);

        Query query = parser.parse(q);

       

        long start = System.currentTimeMillis();

        TopDocs hits = is.search(query, 10); //开始查询,得到Top10

        long end = System.currentTimeMillis();

        System.out.println("匹配"+q+",耗时"+(end-start)+"毫秒"+"查询到:"+hits.totalHits+ "条记录");

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("fullPath"));

        }

        reader.close();

    }

   

    public static void main(String[] args) {

        String indexDir = "D:\\lucene";

        String q = "Zygmunt Saloni";

        try {

            search(indexDir, q);

        } catch (Exception e) {

            e.printStackTrace();

        }

    }

   

}

 

文档操作

import java.nio.file.Paths;

 

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.StringField;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.Term;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.junit.Before;

import org.junit.Test;

 

public class IndexingTest01 {

 

    private String ids[] = { "1", "2", "3" };

    private String citys[] = { "qingdao", "nanjing", "shanghai" };

    private String descs[] = { "Qingdao is a beautiful city.",

                               "Nanjing is a city of culture.",

                               "Shanghai is a bustling city." };

 

    private Directory directory;

   

    @Before

    public void setUp() throws Exception {

        directory = FSDirectory.open(Paths.get("D:\\lucene"));

        IndexWriter writer = getWriter();

        for(int i=0;i<ids.length;i++) {

            Document document = new Document();

            document.add(new StringField("id", ids[i], Field.Store.YES)); //存索引

            document.add(new StringField("city", citys[i], Field.Store.YES));

            document.add(new StringField("desc", descs[i], Field.Store.NO)); //不存索引

            writer.addDocument(document); //添加文档操作

        }

        writer.close();//关闭

    }

   

    /**

     *

     * <b>Description</b><br>

     * (获取IndexWriter实例)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午10:35:36</b>

     */

    private IndexWriter getWriter()throws Exception{

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        IndexWriter writer = new IndexWriter(directory, iwc);

        return writer;

    }

   

    /**

     *

     * <b>Description</b><br>

     * (测试写文档的数量)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午10:44:05</b>

     */

    @Test

    public void testIndexWriter()throws Exception{

        IndexWriter writer = getWriter();

        System.out.println("写入了:"+writer.numDocs()+" 个文档");

        writer.close();

    }

   

    /**

     *

     * <b>Description</b><br>

     * (测试读取文档)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午10:50:09</b>

     */

    @Test

    public void testIndexReader()throws Exception{

        IndexReader reader = DirectoryReader.open(directory);

        System.out.println("最大文档数量:"+reader.maxDoc());

        System.out.println("实际文档数量:"+reader.numDocs());

        reader.close();

    }

   

    /**

     *

     * <b>Description</b><br>

     * (测试删除,在合并前)

     * 只做标记,并没有真正的删除,访问量大的网站采用这种方式,等到空闲的时候再删除

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午10:57:23</b>

     */

    @Test

    public void testDelBeforeMerge()throws Exception{

        IndexWriter writer = getWriter();

        System.out.println("删除前的文档数量:"+writer.numDocs());

        writer.deleteDocuments(new Term("id","1")); //删除id1

        writer.commit();

        System.out.println("删除后最大文档数量:"+writer.maxDoc());

        System.out.println("删除后实际文档数量:"+writer.numDocs());

    }

   

    /**

     *

     * <b>Description</b><br>

     * (测试删除,在合并后)

     * 真正的删除:大型系统中非常耗时,空闲时间在执行彻底删除

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午11:04:02</b>

     */

    @Test

    public void testDelAfterMerge()throws Exception{

        IndexWriter writer = getWriter();

        System.out.println("删除前的文档数量:"+writer.numDocs());

        writer.deleteDocuments(new Term("id","1")); //删除id1

        writer.forceMergeDeletes(); //强制合并

        writer.commit();

        System.out.println("删除后最大文档数量:"+writer.maxDoc());

        System.out.println("删除后实际文档数量:"+writer.numDocs());

    }

   

    /**

     *

     * <b>Description</b><br>

     * (更新文档)比较耗时

     * 1.找到id1的文档,2.删除,3.重新创建

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午11:09:43</b>

     */

    @Test

    public void testUpdate()throws Exception{

        IndexWriter writer = getWriter();

        Document document = new Document();

        document.add(new StringField("id", "1", Field.Store.YES)); //存索引

        document.add(new StringField("city", "qingdao", Field.Store.YES));

        document.add(new StringField("desc", "xXXxXXXxXX", Field.Store.NO)); //不存索引

        writer.updateDocument(new Term("id","1"), document);

        writer.close();

    }

 

}

 

文档域加权(可以使搜索排名提高)

import java.nio.file.Paths;

 

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.document.StringField;

import org.apache.lucene.document.TextField;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.index.IndexWriterConfig;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.junit.Test;

 

public class IndexingTest02 {

 

    private String ids[]={"1","2","3","4"};

    private String authors[]={"Jack","Marry","John","Json"};

    private String positions[]={"accounting","technician","salesperson","boss"};

    private String titles[]={"Java is a good language.","Java is a cross platform language","Java powerful","You should learn java"};

    private String contents[]={

        "If possible, use the same JRE major version at both index and search time.",

        "When upgrading to a different JRE major version, consider re-indexing. ",

        "Different JRE major versions may implement different versions of Unicode,",

        "For example: with Java 1.4, `LetterTokenizer` will split around the character U+02C6,"

    };

   

    private Directory directory;

   

    /**

     *

     * <b>Description</b><br>

     * (获取IndexWriter实例)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午11:22:40</b>

     */

    private IndexWriter getWriter()throws Exception{

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        IndexWriter writer = new IndexWriter(directory, iwc);

        return writer;

    }

   

    /**

     *

     * <b>Description</b><br>

     * (生成索引)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午11:24:47</b>

     */

    @Test

    public void index()throws Exception{

        directory = FSDirectory.open(Paths.get("D:\\lucene"));

        IndexWriter writer = getWriter();

        for(int i=0;i<ids.length;i++) {

            Document document = new Document();

            //不需要分词器解析的使用StringField

            document.add(new StringField("id", ids[i], Field.Store.YES)); //YES存索引

            document.add(new StringField("author", authors[i], Field.Store.YES));

            document.add(new StringField("position", positions[i], Field.Store.YES));

            //需要分词解析的使用TextField

            document.add(new TextField("title", titles[i], Field.Store.YES));

            document.add(new TextField("content", contents[i], Field.Store.NO));

            writer.addDocument(document); //添加文档操作

        }

        writer.close();//关闭

    }

   

    /**

     *

     * <b>Description</b><br>

     * (生成索引,并给BOSS的标题加权,提高排名)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午11:24:47</b>

     */

    @Test

    public void indexBoost()throws Exception{

        directory = FSDirectory.open(Paths.get("D:\\lucene"));

        IndexWriter writer = getWriter();

        for(int i=0;i<ids.length;i++) {

            Document document = new Document();

            //不需要分词器解析的使用StringField

            document.add(new StringField("id", ids[i], Field.Store.YES)); //YES存索引

            document.add(new StringField("author", authors[i], Field.Store.YES));

            document.add(new StringField("position", positions[i], Field.Store.YES));

            //需要分词解析的使用TextField

            TextField field = new TextField("title", titles[i], Field.Store.YES);

            if("boss".equals(positions[i])) {

                //默认为1f,高于1f加权,低于1f减权

                field.setBoost(1.5f);

            }

            document.add(field);

            document.add(new TextField("content", contents[i], Field.Store.NO));

            writer.addDocument(document); //添加文档操作

        }

        writer.close();//关闭

    }

   

    /**

     *

     * <b>Description</b><br>

     * (搜索)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午11:32:57</b>

     */

    @Test

    public void search()throws Exception{

        directory = FSDirectory.open(Paths.get("D:\\lucene"));

        IndexReader reader = DirectoryReader.open(directory);

        IndexSearcher iSearcher = new IndexSearcher(reader);

       

        String searchField = "title"; //查询的字段

        String q = "java"; //查询的内容

        Term term = new Term(searchField, q);

        Query query = new TermQuery(term);

        TopDocs hits = iSearcher.search(query, 10); //10为查询的条数

        System.out.println("匹配"+q+",总共查询到"+hits.totalHits+"个文档");

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = iSearcher.doc(scoreDoc.doc);

            System.out.println(document.get("author"));

        }

        reader.close();

    }

   

}

 

查询功能

创建Maven项目(打包方式为jar),并导入对应的jar依赖

pom.xml文件

<dependencies>

    <!-- 核心包lucene-core -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-core</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 查询解析lucene-queryparser -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-queryparser</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 解析器lucene-analyzers-common -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-analyzers-common</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- junit -->

    <dependency>

        <groupId>junit</groupId>

        <artifactId>junit</artifactId>

        <version>4.12</version>

        <scope>test</scope>

    </dependency>

</dependencies>

 

创建包com.kingsoft.lucene

创建Indexer.java用于写索引

/**

 *

 * @ClassName: Indexer

 * @Description: TODO (写索引)

 * @author A我去

 * @date 20191024日下午9:26:46

 */

public class Indexer {

   

    private IndexWriter writer; //写索引实例

   

    /**

     * 构造方法,实例化IndexWriter

     * @param indexDir

     * @throws Exception

     */

    public Indexer(String indexDir)throws Exception{

        Directory directory = FSDirectory.open(Paths.get(indexDir));

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器,只对英文有效

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        writer = new IndexWriter(directory, iwc);

    }

   

    /**

     *

     * <b>Description</b><br>

     * (关闭写索引)

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:35:48</b>

     */

    public void close()throws Exception{

        writer.close();

    }

   

    /**

     *

     * <b>Description</b><br>

     * (索引指定目录下的所有文件)

     * d:\lucene\data\**

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:37:16</b>

     */

    public int index(String dataDir)throws Exception{

        File[] files = new File(dataDir).listFiles(); //遍历目录下的所有文件

        for(File file : files) {

            indexFile(file);

        }

       

        //返回索引了多少个文件

        return writer.numRamDocs();

    }

 

    /**

     *

     * <b>Description</b><br>

     * (索引指定文件)

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:41:02</b>

     */

    private void indexFile(File file)throws Exception{

        System.out.println("索引文件:"+file.getCanonicalPath());

        Document document = getDocument(file);

        writer.addDocument(document); //添加索引

    }

 

    /**

     *

     * <b>Description</b><br>

     * (获取文档,文档里再设置每个字段)

     * <br>

     * -------------------------------------------------<br>

     * <b>A我去  20191024 下午9:43:32</b>

     */

    private Document getDocument(File file)throws Exception{

        Document document = new Document();

        //内容,不保存内容到索引

        document.add(new TextField("contents", new FileReader(file)));

        //保存文件名到索引

        document.add(new TextField("fileName", file.getName(), Field.Store.YES));

        //保存完整路径到索引

        document.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES));

        return document;

    }

   

    public static void main(String[] args){

        String indexDir = "D:\\lucene"; //索引的输出位置

        String dataDir = "D:\\lucene\\data"; //数据源

        Indexer indexer = null;

        int numIndexed = 0;

        long start = System.currentTimeMillis();

        try {

            indexer = new Indexer(indexDir);

            numIndexed = indexer.index(dataDir);

        } catch (Exception e) {

            e.printStackTrace();

        } finally {

            try {

                indexer.close();

            } catch (Exception e) {

                e.printStackTrace();

            }

        }

        long end = System.currentTimeMillis();

        System.out.println("索引了"+numIndexed+" 个文件,使用了"+(end-start)+" 毫秒");

    }

   

 

}

 

 

在D盘创建lucene\data文件夹,并在data文件夹中创建多个文本文件,供后面的搜索测试使用

 

创建SearchTest.java用于测试多种搜索方法

import java.nio.file.Paths;

 

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.Term;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TermQuery;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.junit.After;

import org.junit.Before;

import org.junit.Test;

 

public class SearchTest {

 

    private Directory directory;

    private IndexReader reader;

    private IndexSearcher is;

   

    @Before

    public void setUp()throws Exception{

        directory = FSDirectory.open(Paths.get("D:\\lucene"));

        reader = DirectoryReader.open(directory);

        is = new IndexSearcher(reader);

    }

   

    @After

    public void tearDown()throws Exception{

        reader.close();

    }

   

    /**

     *

     * <b>Description</b><br>

     * (对特定项进行搜索,不常用)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191026 下午9:48:08</b>

     */

    @Test

    public void testTermQuery()throws Exception{

        String searchField = "contents";

        String q = "particular";

        Term term = new Term(searchField, q);

        Query query = new TermQuery(term);

        TopDocs hits = is.search(query, 10);

        System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档");

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("fullPath"));

        }

    }

   

    /**

     *

     * <b>Description</b><br>

     * (解析查询表达式,使用QueryParser比较常用)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191026 下午9:58:59</b>

     */

    @Test

    public void testQueryParser()throws Exception{

        String searchField = "contents";

        String q = "particular";

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        QueryParser parser = new QueryParser(searchField, analyzer);

        Query query = parser.parse(q);

        TopDocs hits = is.search(query, 10);

        System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档");

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("fullPath"));

        }

    }

   

    /**

     *

     * <b>Description</b><br>

     * (解析查询表达式--加入条件)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191026 下午10:05:18</b>

     */

    @Test

    public void testQueryParserOR()throws Exception{

        String searchField = "contents";

        //或者使用particular Unicode也可以表示或的关系

        String q = "particular or Unicode";

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        QueryParser parser = new QueryParser(searchField, analyzer);

        Query query = parser.parse(q);

        TopDocs hits = is.search(query, 10);

        System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档");

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("fullPath"));

        }

    }

   

    /**

     *

     * <b>Description</b><br>

     * (解析查询表达式--加入条件)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191026 下午10:05:18</b>

     */

    @Test

    public void testQueryParserAND()throws Exception{

        String searchField = "contents";

        //要使用大写的AND关联两个字符

        String q = "particular AND benchmarks";

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        QueryParser parser = new QueryParser(searchField, analyzer);

        Query query = parser.parse(q);

        TopDocs hits = is.search(query, 10);

        System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档");

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("fullPath"));

        }

    }

   

    /**

     *

     * <b>Description</b><br>

     * (解析查询表达式--模糊查询)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191026 下午10:05:18</b>

     */

    @Test

    public void testQueryParserBlurry()throws Exception{

        String searchField = "contents";

        //要使用~进行模糊查询

        String q = "part~";

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        QueryParser parser = new QueryParser(searchField, analyzer);

        Query query = parser.parse(q);

        TopDocs hits = is.search(query, 10);

        System.out.println("匹配"+q+", 共查询到"+hits.totalHits+"个文档");

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("fullPath"));

        }

    }  

}

 

 

Lucene的其他查询方式

在com.kingsoft.lucene包下创建Indexer.java

public class Indexer {

 

    private Integer ids[] = { 1, 2, 3 };

    private String citys[] = { "aingdao", "nanjing", "shanghai" };

    private String descs[] = { "Qingdao is a beautiful city.",

            "Nanjing is b city of culture.",

            "Shanghai is c bustling city." };

   

    private Directory directory;

   

    /**

     *

     * <b>Description</b><br>

     * (获取IndexWriter实例)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午10:35:36</b>

     */

    private IndexWriter getWriter()throws Exception{

        Analyzer analyzer = new StandardAnalyzer(); //标准分词器

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        IndexWriter writer = new IndexWriter(directory, iwc);

        return writer;

    }

   

    /**

     *

     * <b>Description</b><br>

     * (生成索引)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191027 下午9:01:21</b>

     */

    private void index(String indexDir)throws Exception{

        directory = FSDirectory.open(Paths.get(indexDir));

        IndexWriter writer = getWriter();

        for(int i=0;i<ids.length;i++) {

            Document document = new Document();

            document.add(new IntField("id", ids[i], Field.Store.YES));

            document.add(new StringField("city", citys[i], Field.Store.YES));

            document.add(new StringField("desc", descs[i], Field.Store.YES));

            writer.addDocument(document); //添加文档

        }

        writer.close();

    }

   

    public static void main(String[] args)throws Exception{

        new Indexer().index("D:\\lucene");

    }

   

 

}

 

创建测试类SearchTest.java

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.index.Term;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.BooleanQuery;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.NumericRangeQuery;

import org.apache.lucene.search.PrefixQuery;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.junit.After;

import org.junit.Before;

import org.junit.Test;

 

public class SearchTest {

 

    private Directory directory;

    private IndexReader reader;

    private IndexSearcher is;

 

    @Before

    public void setUp() throws Exception {

        directory = FSDirectory.open(Paths.get("D:\\lucene"));

        reader = DirectoryReader.open(directory);

        is = new IndexSearcher(reader);

    }

 

    @After

    public void tearDown() throws Exception {

        reader.close();

    }

 

    /**

     *

     * <b>Description</b><br>

     * (指定数字范围搜索)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191027 下午9:47:10</b>

     */

    @Test

    public void testNumRangeQuery()throws Exception{

//字段为id,从1开始到2,是否包含开始,是否包含结束

        NumericRangeQuery<Integer> query = NumericRangeQuery

.newIntRange("id", 1, 2, true, true);

        TopDocs hits = is.search(query, 10);

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("id"));

            System.out.println(document.get("city"));

            System.out.println(document.get("desc"));

        }

    }

   

    /**

     *

     * <b>Description</b><br>

     * (指定字符串开头搜素)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191027 下午10:12:08</b>

     */

    @Test

    public void testPreFixQuery()throws Exception{

             //查询字段为city,开头为a

        PrefixQuery query = new PrefixQuery(new Term("city","a"));     

TopDocs hits = is.search(query, 10);

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("id"));

            System.out.println(document.get("city"));

            System.out.println(document.get("desc"));

        }

    }

   

    /**

     *

     * <b>Description</b><br>

     * (多条件组合搜素)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191027 下午10:16:39</b>

     */

    @Test

    public void testBooleanQuery()throws Exception{

        NumericRangeQuery<Integer> query1 = NumericRangeQuery

.newIntRange("id", 1, 2, true, true);

        PrefixQuery query2 = new PrefixQuery(new Term("city","a"));

        BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();

        /**

         * Occur.MUST == AND关系

         * Occur.SHOULD == OR关系

         * Occur.MUST_NOT == 不包含

         */

        booleanQuery.add(query1, BooleanClause.Occur.MUST);

        booleanQuery.add(query2, BooleanClause.Occur.MUST);

        TopDocs hits = is.search(booleanQuery.build(), 10);

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("id"));

            System.out.println(document.get("city"));

            System.out.println(document.get("desc"));

        }

    }

 

}

 

中文分词,高亮显示

创建Maven项目(打包方式为jar),并在pom.xml中导入以下依赖

<dependencies>

    <!-- 核心包lucene-core -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-core</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 查询解析lucene-queryparser -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-queryparser</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 解析器lucene-analyzers-common -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-analyzers-common</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 中文分词器lucene-analyzers-smartcn -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-analyzers-smartcn</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 高亮显示lucene-highlighter -->

    <dependency>

        <groupId>org.apache.lucene</groupId>

        <artifactId>lucene-highlighter</artifactId>

        <version>5.3.1</version>

    </dependency>

    <!-- 测试junit -->

    <dependency>

        <groupId>junit</groupId>

        <artifactId>junit</artifactId>

        <version>4.12</version>

        <scope>test</scope>

    </dependency>

</dependencies>

 

创建Indexer.java创建索引

import java.nio.file.Paths;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

 

public class Indexer {

 

    private Integer ids[] = { 1, 2, 3 };

    private String citys[] = {"青岛","南京","上海"};

    private String descs[] = { "青岛是一个美丽的城市。",

            "南京是一个有文化的城市。",

            "上海是一个繁华的城市。" };

   

    private Directory directory;

   

    /**

     *

     * <b>Description</b><br>

     * (获取IndexWriter实例)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191025 下午10:35:36</b>

     */

    private IndexWriter getWriter()throws Exception{

        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();

        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        IndexWriter writer = new IndexWriter(directory, iwc);

        return writer;

    }

   

    /**

     *

     * <b>Description</b><br>

     * (生成索引)

     * <br>

     * -------------------------------------------------<br>

     * <b>王欢  20191027 下午9:01:21</b>

     */

    private void index(String indexDir)throws Exception{

        directory = FSDirectory.open(Paths.get(indexDir));

        IndexWriter writer = getWriter();

        for(int i=0;i<ids.length;i++) {

            Document document = new Document();

            document.add(new IntField("id", ids[i], Field.Store.YES));

            document.add(new StringField("city", citys[i], Field.Store.YES));

            document.add(new TextField("desc", descs[i], Field.Store.YES));

            writer.addDocument(document); //添加文档

        }

        writer.close();

    }

   

    public static void main(String[] args)throws Exception{

        new Indexer().index("D:\\lucene");

    }

}

 

创建Searcher.java

import java.io.StringReader;

import java.nio.file.Paths;

 

import org.apache.lucene.analysis.TokenStream;

import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.index.DirectoryReader;

import org.apache.lucene.index.IndexReader;

import org.apache.lucene.queryparser.classic.QueryParser;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

import org.apache.lucene.search.highlight.Fragmenter;

import org.apache.lucene.search.highlight.Highlighter;

import org.apache.lucene.search.highlight.QueryScorer;

import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

import org.apache.lucene.search.highlight.SimpleSpanFragmenter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

 

public class Searcher {

 

    public static void search(String indexDir, String q)throws Exception{

        Directory directory = FSDirectory.open(Paths.get(indexDir));

        IndexReader reader = DirectoryReader.open(directory);

        IndexSearcher is = new IndexSearcher(reader);

        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer();

        QueryParser parser = new QueryParser("desc", analyzer);

        Query query = parser.parse(q);

        long start = System.currentTimeMillis();

        TopDocs hits = is.search(query, 10); //开始查询,得到Top10

        long end = System.currentTimeMillis();

        System.out.println("匹配"+q+",耗时"+(end-start)+"毫秒"+"查询到:"+hits.totalHits+ "条记录");

        /**

         * 得到关键字加粗,字体为红色

         */

        QueryScorer scorer = new QueryScorer(query); //计算得分,把得分高的片段显示出来

        Fragmenter fragmenter = new SimpleSpanFragmenter(scorer); //片段

        SimpleHTMLFormatter simple = new SimpleHTMLFormatter("<b><font color='red'>","</font></b>");

        Highlighter highlighter = new Highlighter(simple, scorer);

        highlighter.setTextFragmenter(fragmenter);

       

        for(ScoreDoc scoreDoc : hits.scoreDocs) {

            Document document = is.doc(scoreDoc.doc);

            System.out.println(document.get("city"));

            System.out.println(document.get("desc"));

            String desc = document.get("desc");

            if(desc!=null) {

                //得到权重最高的摘要

                TokenStream tokenStream = analyzer.tokenStream("desc", new StringReader(desc));

                //高亮显示

                String bestFragment = highlighter.getBestFragment(tokenStream, desc);

                System.out.println(bestFragment);

            }

        }

        reader.close();

    }

   

    public static void main(String[] args) {

        String indexDir = "D:\\lucene";

        String q = "南京文化";

        try {

            search(indexDir, q);

        } catch (Exception e) {

            e.printStackTrace();

        }

    }

   

}

 

查询结果以HTML格式输出,示例如下:

匹配南京文化,耗时13毫秒查询到:1条记录

南京

南京是一个有文化的城市。

<b><font color='red'>南京</font></b>是一个有<b><font color='red'>文化</font></b>的城市。

本文只做自己的学习记录,不喜勿喷

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值