【Lucene4.8教程之一】使用Lucene4.8进行索引及搜索的基本操作

最新推荐文章于 2019-11-08 17:50:31 发布

yangzongzhuan

最新推荐文章于 2019-11-08 17:50:31 发布

阅读量755

点赞数

分类专栏： L1_Lucene

L1_Lucene 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

在Lucene对文本进行处理的过程中，可以大致分为三大部分：

1、索引文件：提取文档内容并分析，生成索引

2、搜索内容：搜索索引内容，根据搜索关键字得出搜索结果

3、分析内容：对搜索词汇进行分析，生成Query对象。

注：事实上，除了最基本的完全匹配搜索以外，其它都需要在搜索前进行分析。

如不加分析步骤，则搜索JAVA，是没有结果的，因为在索引过程中已经将词汇均转化为小写，而此处搜索时则要求关键字完全匹配。

使用了QueryParser类以后，则根据Analyzer的具体实现类，对搜索词汇进行分析，如大小写转换，java and ant等的搜索词解释等。

一、索引文件

基本步骤如下：

1、创建索引库IndexWriter

2、根据文件创建文档Document

3、向索引库中写入文档内容

[java]view plaincopy 
   
 package com.ljh.search.index;  
   
 import java.io.File;  
 import java.io.FileReader;  
 import java.io.IOException;  
   
 import org.apache.lucene.analysis.standard.StandardAnalyzer;  
 import org.apache.lucene.document.Document;  
 import org.apache.lucene.document.Field;  
 import org.apache.lucene.document.LongField;  
 import org.apache.lucene.document.StringField;  
 import org.apache.lucene.document.TextField;  
 import org.apache.lucene.index.IndexWriter;  
 import org.apache.lucene.index.IndexWriterConfig;  
 import org.apache.lucene.store.Directory;  
 import org.apache.lucene.store.FSDirectory;  
 import org.apache.lucene.util.Version;  
   
 // 1、创建索引库IndexWriter  
 // 2、根据文件创建文档Document  
 // 3、向索引库中写入文档内容  
   
 public class IndexFiles {  
   
     public static void main(String[] args) throws IOException {  
   
         String usage = "java IndexFiles"  
                 + " [-index INDEX_PATH] [-docs DOCS_PATH] \n\n"  
                 + "This indexes the documents in DOCS_PATH, creating a Lucene index"  
                 + "in INDEX_PATH that can be searched with SearchFiles";  
   
         String indexPath = null;  
         String docsPath = null;  
         for (int i = 0; i < args.length; i++) {  
             if ("-index".equals(args[i])) {  
                 indexPath = args[i + 1];  
                 i++;  
             } else if ("-docs".equals(args[i])) {  
                 docsPath = args[i + 1];  
                 i++;  
             }  
         }  
   
         if (docsPath == null) {  
             System.err.println("Usage: " + usage);  
             System.exit(1);  
         }  
   
         final File docDir = new File(docsPath);  
         if (!docDir.exists() || !docDir.canRead()) {  
             System.out  
                     .println("Document directory '"  
                             + docDir.getAbsolutePath()  
                             + "' does not exist or is not readable, please check the path");  
             System.exit(1);  
         }  
   
         IndexWriter writer = null;  
         try {  
             // 1、创建索引库IndexWriter  
             writer = getIndexWriter(indexPath);  
             index(writer, docDir);  
         } catch (IOException e) {  
             e.printStackTrace();  
         } finally {  
             writer.close();  
         }  
   
     }  
   
     private static IndexWriter getIndexWriter(String indexPath)  
             throws IOException {  
   
         Directory indexDir = FSDirectory.open(new File(indexPath));  
   
         IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48,  
                 new StandardAnalyzer(Version.LUCENE_48));  
   
         IndexWriter writer = new IndexWriter(indexDir, iwc);  
   
         return writer;  
     }  
   
     private static void index(IndexWriter writer, File file) throws IOException {  
   
         if (file.isDirectory()) {  
             String[] files = file.list();  
             if (files != null) {  
                 for (int i = 0; i < files.length; i++) {  
                     index(writer, new File(file, files[i]));  
                 }  
             }  
         } else {  
             // 2、根据文件创建文档Document  
             Document doc = new Document();  
             Field pathField = new StringField("path", file.getPath(),  
                     Field.Store.YES);  
             doc.add(pathField);  
             doc.add(new LongField("modified", file.lastModified(),  
                     Field.Store.NO));  
             doc.add(new TextField("contents", new FileReader(file)));  
             System.out.println("Indexing " + file.getName());  
               
             // 3、向索引库中写入文档内容  
             writer.addDocument(doc);  
         }  
   
     }  
   
 }  

（1）使用“java com.ljh.search.index.IndexFiles -index d:/index -docs d:/tmp”运行程序（类名区分大小写，且需要带上包名），索引d:/tmp中的文件，并将索引文件放置到d:/index。

（2）上述生成的索引文件可以使用Luke进行查看。目前Luke已迁移至github进行托管。

二、搜索文件

1、打开索引库IndexSearcher
2、根据关键词进行搜索
3、遍历结果并处理

[java]view plaincopy 
   
 package com.ljh.search.search;  
   
 //1、打开索引库IndexSearcher  
 //2、根据关键词进行搜索  
 //3、遍历结果并处理  
 import java.io.File;  
 import java.io.IOException;  
   
 import org.apache.lucene.index.DirectoryReader;  
 import org.apache.lucene.index.IndexReader;  
 import org.apache.lucene.index.Term;  
 import org.apache.lucene.search.IndexSearcher;  
 import org.apache.lucene.search.ScoreDoc;  
 import org.apache.lucene.search.TermQuery;  
 import org.apache.lucene.search.TopDocs;  
 import org.apache.lucene.store.Directory;  
 import org.apache.lucene.store.FSDirectory;  
   
 public class Searcher {  
     public static void main(String[] args) throws IOException {  
   
         String indexPath = null;  
         String term = null;  
         for (int i = 0; i < args.length; i++) {  
             if ("-index".equals(args[i])) {  
                 indexPath = args[i + 1];  
                 i++;  
             } else if ("-term".equals(args[i])) {  
                 term = args[i + 1];  
                 i++;  
             }  
         }  
   
         System.out.println("Searching " + term + " in " + indexPath);  
   
         // 1、打开索引库  
         Directory indexDir = FSDirectory.open(new File(indexPath));  
         IndexReader ir = DirectoryReader.open(indexDir);  
         IndexSearcher searcher = new IndexSearcher(ir);  
   
         // 2、根据关键词进行搜索  
         TopDocs docs = searcher.search(  
                 new TermQuery(new Term("contents", term)), 20);  
   
         // 3、遍历结果并处理  
         ScoreDoc[] hits = docs.scoreDocs;  
         System.out.println(hits.length);  
         for (ScoreDoc hit : hits) {  
             System.out.println("doc: " + hit.doc + " score: " + hit.score);  
         }  
   
         ir.close();  
   
     }  
   
 }  

三、分析

事实上，除了最基本的完全匹配搜索以外，其它都需要在搜索前进行分析。

如不加分析步骤，则搜索JAVA，是没有结果的，因为在索引过程中已经将词汇均转化为小写，而此处搜索时则要求关键字完全匹配。

使用了QueryParser类以后，则根据Analyzer的具体实现类，对搜索词汇进行分析，如大小写转换，java and ant等的搜索词解释等。

分析过程有2个基本步骤：

1、生成QueryParser对象

2、调用QueryParser.parse()生成Query对象。

具体代码，将下述代码：

[java]view plaincopy 
   
 // 2. Search for the keyword (exact-match TermQuery on the "contents" field, top 20 hits)
 TopDocs docs = searcher.search(  
         new TermQuery(new Term("contents", term)), 20);  

用以下代替：

[java]view plaincopy 
   
 // 2、根据关键词进行搜索  
 /*TopDocs docs = searcher.search( 
         new TermQuery(new Term("contents", term)), 10);*/  
 QueryParser parser = new QueryParser(Version.LUCENE_48, "contents", new SimpleAnalyzer(Version.LUCENE_48));  
 Query query = null;  
 try {  
     query = parser.parse(term);  
 } catch (ParseException e) {  
     e.printStackTrace();  
 }  
 TopDocs docs = searcher.search(query, 30);