Lucene入门

最新推荐文章于 2024-11-14 19:49:43 发布

tuiyun

最新推荐文章于 2024-11-14 19:49:43 发布

阅读量379

点赞数

分类专栏： j2ee 文章标签： java Java JAVA lucene Lucene 全文检索数据库

j2ee 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

为什么不用数据库提供的全文检索功能呢？

由于数据库索引不是为全文索引设计的，因此，使用like "%keyword%"时，数据库索引是不起作用的，在使用like查询时，搜索过程又变成类似于一页页翻书的遍历过程了，所以对于含有模糊查询的数据库服务来说，LIKE对性能的危害是极大的。如果是需要对多个关键词进行模糊匹配：like "%keyword1%" and like "%keyword2%" ...，其效率也就可想而知了。

所以，建立一个高效检索系统的关键，是建立一个类似于科技索引的反向索引机制：在将数据源（比如多篇文章）按顺序存储的同时，另外维护一个排好序的关键词列表，用于存储“关键词==>文章”的映射关系。利用这样的映射关系索引：[关键词==>出现关键词的文章编号，出现次数（甚至包括位置：起始偏移量，结束偏移量），出现频率]，检索过程就变成了把模糊查询转换为多个可以利用索引的精确查询的逻辑组合的过程，从而大大提高了多关键词查询的效率。所以，全文检索问题归结到最后是一个排序问题。

由此可以看出模糊查询相对数据库的精确查询是一个非常不确定的问题，这也是大部分数据库对全文检索支持有限的原因。Lucene最核心的特征是通过特殊的索引结构实现了传统数据库不擅长的全文索引机制，并提供了扩展接口，以方便针对不同应用的定制。

和数据库全文检索相比，Lucene的创新之处：

大部分的搜索（数据库）引擎都是用B树结构来维护索引，索引的更新会导致大量的IO操作。Lucene在实现中对此稍微有所改进：不是维护一个索引文件，而是在扩展索引的时候不断创建新的索引文件，然后定期地把这些新的小索引文件合并到原先的大索引中（针对不同的更新策略，批次的大小可以调整），这样在不影响检索效率的前提下，提高了索引的效率。

另外Lucene还有两个优点：

（1）Apache Lucene是一个开放源代码的搜索引擎

（2）可以轻易地为Java软件加入全文搜寻功能。

Lucene的目的是为软件开发人员提供一个简单易用的工具包，以方便地在目标系统中实现全文检索的功能，或者是以此为基础建立起完整的全文检索引擎。

1、新建一个java工程，导入Lucene所需jar，如下图

目录结构

其中：luceneds为数据源的存储位置，luceneindex为存放索引文件的位置，即索引库。如果索引库已被创建，那么luceneindex目录下会有索引文件，如下图：

代码

  
  
   
   package
   
    com.ljq.lucene;


   
   import
   
    java.io.BufferedReader;

   
   import
   
    java.io.File;

   
   import
   
    java.io.FileInputStream;

   
   import
   
    java.io.InputStreamReader;


   
   import
   
    org.apache.lucene.analysis.Analyzer;

   
   import
   
    org.apache.lucene.analysis.standard.StandardAnalyzer;

   
   import
   
    org.apache.lucene.document.Document;

   
   import
   
    org.apache.lucene.document.Field;

   
   import
   
    org.apache.lucene.document.NumberTools;

   
   import
   
    org.apache.lucene.document.Field.Index;

   
   import
   
    org.apache.lucene.document.Field.Store;

   
   import
   
    org.apache.lucene.index.IndexWriter;

   
   import
   
    org.apache.lucene.index.IndexWriter.MaxFieldLength;

   
   import
   
    org.apache.lucene.queryParser.MultiFieldQueryParser;

   
   import
   
    org.apache.lucene.queryParser.QueryParser;

   
   import
   
    org.apache.lucene.search.Filter;

   
   import
   
    org.apache.lucene.search.IndexSearcher;

   
   import
   
    org.apache.lucene.search.Query;

   
   import
   
    org.apache.lucene.search.ScoreDoc;

   
   import
   
    org.apache.lucene.search.TopDocs;

   
   import
   
    org.junit.Test;


   
   /**
   
   
 * 开发lucene步骤：先创建索引，再搜索
 * 
 * 
   
   @author
   
    jiqinlin
 *
 
   
   */
   
   

   
   public
   
    
   
   class
   
    HelloWorld {
    
   
   //
   
    数据源路径
   
   

   
       String dspath 
   
   =
   
    
   
   "
   
   F:\\android\\luceneprj\\luceneds\\IndexWriter addDocument's a javadoc .txt
   
   "
   
   ;
    
   
   //
   
   存放索引文件的位置，即索引库
   
   

   
       String indexpath 
   
   =
   
    
   
   "
   
   F:\\android\\luceneprj\\luceneindex
   
   "
   
   ;
    
   
   //
   
   分词器
   
   

   
       Analyzer analyzer 
   
   =
   
    
   
   new
   
    StandardAnalyzer();

    
   
   /**
   
   
     * 创建索引
     * 
     * IndexWriter 用来操作（增、删、改）索引库的
     
   
   */
   
   
    @Test
    
   
   public
   
    
   
   void
   
    createIndex() 
   
   throws
   
    Exception {
        File file 
   
   =
   
    
   
   new
   
    File(dspath);
        
   
   //
   
   Document存放经过组织后的数据源，只有转换为Document对象才可以被索引和搜索到
   
   

   
           Document doc 
   
   =
   
    
   
   new
   
    Document();
        
   
   //
   
   文件名称
   
   

   
           doc.add(
   
   new
   
    Field(
   
   "
   
   name
   
   "
   
   , file.getName(), Store.YES, Index.ANALYZED));
        
   
   //
   
   检索到的内容
   
   

   
           doc.add(
   
   new
   
    Field(
   
   "
   
   content
   
   "
   
   , readFileContent(file), Store.YES, Index.ANALYZED));
        
   
   //
   
   文件大小
   
   

   
           doc.add(
   
   new
   
    Field(
   
   "
   
   size
   
   "
   
   , NumberTools.longToString(file.length()),
                Store.YES, Index.NOT_ANALYZED));
        
   
   //
   
   检索到的文件位置
   
   

   
           doc.add(
   
   new
   
    Field(
   
   "
   
   path
   
   "
   
   , file.getAbsolutePath(), Store.YES, Index.NOT_ANALYZED));

        
   
   //
   
    建立索引
   
   

   
           IndexWriter indexWriter 
   
   =
   
    
   
   new
   
    IndexWriter(indexpath, analyzer, 
   
   true
   
   ,
                MaxFieldLength.LIMITED);
        indexWriter.addDocument(doc);
        indexWriter.close();
    }

    
   
   /**
   
   
     * 搜索
     * 
     * IndexSearcher 用来在索引库中进行查询
     
   
   */
   
   
    @Test
    
   
   public
   
    
   
   void
   
    search() 
   
   throws
   
    Exception {
        
   
   //
   
   请求字段
        
   
   //
   
   String queryString = "document";
   
   

   
           String queryString 
   
   =
   
    
   
   "
   
   adddocument
   
   "
   
   ;

        
   
   //
   
    1，把要搜索的文本解析为 Query
   
   

   
           String[] fields 
   
   =
   
    { 
   
   "
   
   name
   
   "
   
   , 
   
   "
   
   content
   
   "
   
    };
        QueryParser queryParser 
   
   =
   
    
   
   new
   
    MultiFieldQueryParser(fields, analyzer);
        Query query 
   
   =
   
    queryParser.parse(queryString);

        
   
   //
   
    2，进行查询，从索引库中查找
   
   

   
           IndexSearcher indexSearcher 
   
   =
   
    
   
   new
   
    IndexSearcher(indexpath);
        Filter filter 
   
   =
   
    
   
   null
   
   ;
        TopDocs topDocs 
   
   =
   
    indexSearcher.search(query, filter, 
   
   10000
   
   );
        System.out.println(
   
   "
   
   总共有【
   
   "
   
    
   
   +
   
    topDocs.totalHits 
   
   +
   
    
   
   "
   
   】条匹配结果
   
   "
   
   );

        
   
   //
   
    3，打印结果
   
   

   
           
   
   for
   
    (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            
   
   //
   
    文档内部编号
   
   

   
               
   
   int
   
    index 
   
   =
   
    scoreDoc.doc; 
            
   
   //
   
    根据编号取出相应的文档
   
   

   
               Document doc 
   
   =
   
    indexSearcher.doc(index);
            System.out.println(
   
   "
   
   ------------------------------
   
   "
   
   );
            System.out.println(
   
   "
   
   name = 
   
   "
   
    
   
   +
   
    doc.get(
   
   "
   
   name
   
   "
   
   ));
            System.out.println(
   
   "
   
   content = 
   
   "
   
    
   
   +
   
    doc.get(
   
   "
   
   content
   
   "
   
   ));
            System.out.println(
   
   "
   
   size = 
   
   "
   
    
   
   +
   
    NumberTools.stringToLong(doc.get(
   
   "
   
   size
   
   "
   
   )));
            System.out.println(
   
   "
   
   path = 
   
   "
   
    
   
   +
   
    doc.get(
   
   "
   
   path
   
   "
   
   ));
        }
    }

    
   
   /**
   
   
     * 读取文件内容
     
   
   */
   
   
    
   
   public
   
    
   
   static
   
    String readFileContent(File file) {
        
   
   try
   
    {
            BufferedReader reader 
   
   =
   
    
   
   new
   
    BufferedReader(
   
   new
   
    InputStreamReader(
   
   new
   
    FileInputStream(file)));
            StringBuffer content 
   
   =
   
    
   
   new
   
    StringBuffer();
            
   
   for
   
    (String line 
   
   =
   
    
   
   null
   
   ; (line 
   
   =
   
    reader.readLine()) 
   
   !=
   
    
   
   null
   
   ;) {
                content.append(line).append(
   
   "
   
   \n
   
   "
   
   );
            }
            reader.close();
            
   
   return
   
    content.toString();
        } 
   
   catch
   
    (Exception e) {
            
   
   throw
   
    
   
   new
   
    RuntimeException(e);
        }
    }
}