【Lucene02】索引和搜索建立

最新推荐文章于 2023-06-29 15:11:54 发布

小松悦读会 | kevinelstri

最新推荐文章于 2023-06-29 15:11:54 发布

阅读量650

点赞数

分类专栏： Lucene 搜索引擎文章标签： lucene 索引搜索

本文链接：https://blog.csdn.net/kevinelstri/article/details/52356281

版权

搜索引擎同时被 2 个专栏收录

29 篇文章 0 订阅

订阅专栏

Lucene

17 篇文章 0 订阅

订阅专栏

1、索引

1、创建索引库IndexWriter
2、根据文件创建文档Document
3、向索引库中写入文档内容

对文档建立索引，Lucene提供了5个类：Document, Field, IndexWriter, Analyzer, Directory.

1、Document
     用来描述文档的，这里的文档可以指一个 HTML 页面，一封电子邮件，或者是一个文本文件。一个 Document 对象由多个 Field 对象组成。可以把一个 Document 对象想象成数据库中的一条记录，而每个 Field 对象就是记录的一个字段。

2、Field
     Field 对象是用来描述一个文档的某个属性的，比如一封电子邮件的标题和内容可以用两个 Field 对象分别描述。

3、Analyzer
     在一个文档被索引之前，首先需要对文档内容进行分词处理，这部分工作就是由 Analyzer 来做的。Analyzer 类是一个抽象类，它有多个实现。针对不同的语言和应用需要选择适合的 Analyzer。Analyzer 把分词后的内容交给 IndexWriter 来建立索引。

4、IndexWriter
      IndexWriter 是 Lucene 用来创建索引的一个核心的类，它的作用是把一个个的 Document 对象加到索引中来。

5、Directory
     这个类代表了 Lucene 的索引的存储的位置，这是一个抽象类，它目前有两个实现，第一个是 FSDirectory，它表示一个存储在文件系统中的索引的位置。第二个是 RAMDirectory，它表示一个存储在内存当中的索引的位置。

package TestLucene; 
import java.io.File; 
import java.io.FileReader; 
import java.io.Reader; 
import java.util.Date; 
import org.apache.lucene.analysis.Analyzer; 
import org.apache.lucene.analysis.standard.StandardAnalyzer; 
import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; 
import org.apache.lucene.index.IndexWriter; 

/**
 * Indexes every {@code .txt} file found directly inside the data directory
 * (legacy Lucene API) and prints how long the indexing run took.
 */
public class TxtFileIndexer { 

     /**
      * Builds a fresh index in the hard-coded index directory from the text
      * files in the hard-coded data directory.
      *
      * @param args unused
      * @throws Exception if the index cannot be written or a file cannot be read
      */
     public static void main(String[] args) throws Exception{ 

     File   indexDir = new File("D:\\luceneIndex"); 
     File   dataDir  = new File("D:\\luceneData"); 

     // File.listFiles() returns null when the path is missing or is not a
     // directory; bail out before touching the index instead of hitting an NPE.
     File[] dataFiles  = dataDir.listFiles(); 
     if(dataFiles == null){ 
          System.out.println("Data directory does not exist or is not a directory: " 
              + dataDir.getPath()); 
          return; 
     } 

     Analyzer luceneAnalyzer = new StandardAnalyzer(); 
     // Third argument 'true' is passed as in the original; in the legacy API
     // this is the "create" flag (start a new index rather than append).
     IndexWriter indexWriter = new IndexWriter(indexDir,luceneAnalyzer,true); 
     long startTime = new Date().getTime(); 
     try { 
          for(int i = 0; i < dataFiles.length; i++){ 
               if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){
                    System.out.println("Indexing file " + dataFiles[i].getCanonicalPath()); 
                    Document document = new Document(); 
                    Reader txtReader = new FileReader(dataFiles[i]); 
                    try { 
                         document.add(Field.Text("path",dataFiles[i].getCanonicalPath())); 
                         document.add(Field.Text("contents",txtReader)); 
                         indexWriter.addDocument(document); 
                    } finally { 
                         // NOTE(review): addDocument is expected to have consumed the
                         // reader by the time it returns; closing here guarantees the
                         // file handle is released even when indexing fails.
                         txtReader.close(); 
                    } 
               } 
          } 
          indexWriter.optimize(); 
     } finally { 
          // Always release the index write lock, even if a file failed to index.
          indexWriter.close(); 
     } 
     long endTime = new Date().getTime(); 

     System.out.println("It takes " + (endTime - startTime) 
         + " milliseconds to create index for the files in directory "
         + dataDir.getPath());        
     } 
}

package org.algorithm;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/*
 * lucene索引的建立
 * */
/**
 * Demonstrates building a Lucene 4.0 index from the files in a directory.
 *
 * <p>Each regular file becomes one document with three stored fields:
 * {@code filename}, {@code content} and {@code path}.
 */
public class IndexFilesDemo {

    /**
     * Optional content override. When non-empty, this text is indexed as the
     * {@code content} field of every document; when empty (the default) or
     * null, each file's own text is read from disk instead.
     */
    public static String content = "";

    /**
     * Opens an {@link IndexWriter} on the index stored at {@code indexPath}.
     *
     * @param indexPath file-system directory that holds (or will hold) the index
     * @return a writer configured with a {@link StandardAnalyzer}; the caller must close it
     * @throws IOException if the directory or the writer cannot be opened
     */
    public static IndexWriter getIndexWriter(String indexPath) throws IOException{
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        Directory dir = FSDirectory.open(new File(indexPath));// location of the index files

        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        try{
            return new IndexWriter(dir, config);
        }catch(IOException e){
            // Do not leak the directory handle when the writer cannot be created.
            dir.close();
            throw e;
        }
    }

    /**
     * Adds one document per regular file found directly under {@code path},
     * then merges the index down to a single segment and commits.
     *
     * @param writer open index writer; it is not closed by this method
     * @param path   directory whose files are to be indexed
     * @throws IOException if a file cannot be read or the index cannot be written
     */
    public static void index(IndexWriter writer,String path) throws IOException{

        List<File> filelist = getFileList(path);
        for(File file :filelist){
            // Sub-directories have no text to index and cannot be opened as files.
            if(!file.isFile()){
                continue;
            }

            String text = (content == null || content.isEmpty()) ? readFile(file) : content;

            Document doc = new Document();
            doc.add(new StringField("filename", file.getName(),Store.YES));
            // TextField (not StringField) so the analyzer tokenizes the text and
            // individual words can be found with a TermQuery on "content".
            doc.add(new TextField("content", text, Store.YES));
            doc.add(new StringField("path", file.getPath(), Store.YES));

            writer.addDocument(doc);
            System.out.println("索引添加成功！");
        }

        // Merging and committing are expensive; do them once for the whole
        // batch instead of once per document.
        writer.forceMerge(1);
        writer.commit();
    }

    /**
     * Lists the entries directly inside {@code path}.
     *
     * @param path directory to list
     * @return the directory's entries (files and sub-directories); empty when
     *         {@code path} does not exist or is not a directory
     */
    public static List<File> getFileList(String path){
        List<File> filelist = new ArrayList<File>();
        // listFiles() returns null for a missing path or a non-directory.
        File[] files = new File(path).listFiles();
        if(files != null){
            for(File file:files){
                filelist.add(file);
            }
        }
        return filelist;
    }

    /**
     * Reads a whole file into a string.
     * NOTE(review): decodes as UTF-8; confirm that matches the data files' encoding.
     */
    private static String readFile(File file) throws IOException{
        Reader reader = new InputStreamReader(new FileInputStream(file), "UTF-8");
        try{
            StringBuilder text = new StringBuilder();
            char[] buffer = new char[4096];
            int read;
            while((read = reader.read(buffer)) != -1){
                text.append(buffer, 0, read);
            }
            return text.toString();
        }finally{
            reader.close();
        }
    }

    /**
     * Indexes the files under the hard-coded data directory into the
     * hard-coded index directory.
     *
     * @param args unused
     * @throws IOException if indexing fails
     */
    public static void main(String[] args) throws IOException {
        String dataPath = "E:/luceneTxt";
        String indexPath = "e:/luceneindextxt";
        IndexWriter writer = getIndexWriter(indexPath);
        // Grab the directory up front so it can still be closed after the writer is.
        Directory dir = writer.getDirectory();
        try{
            index(writer,dataPath);
        }finally{
            try{
                writer.close();
            }finally{
                dir.close();
            }
        }
    }
}

2、搜索

1、打开索引库IndexSearcher
2、根据关键词进行搜索
3、遍历结果并处理

在这个索引上进行搜索以找到包含某个关键词或短语的文档。Lucene 提供了几个基础的类来完成这个过程，它们分别是 IndexSearcher, Term, Query, TermQuery, Hits.

1、IndexSearcher
IndexSearcher 是用来在建立好的索引上进行搜索的。它只能以只读的方式打开一个索引，所以可以有多个 IndexSearcher 的实例在一个索引上进行操作。

2、Term
Term 是搜索的基本单位，一个 Term 对象由两个 String 类型的域组成。生成一个 Term 对象可以由如下一条语句来完成：Term term = new Term("fieldName","queryWord"); 其中第一个参数代表了要在文档的哪一个 Field 上进行查找，第二个参数代表了要查询的关键词。

3、Query
这是一个抽象类，它有多个实现，比如 TermQuery, BooleanQuery, PrefixQuery. 这个类的目的是把用户输入的查询字符串封装成 Lucene 能够识别的 Query。

4、TermQuery
TermQuery 是抽象类 Query 的一个子类，它同时也是 Lucene 支持的最为基本的一个查询类。生成一个 TermQuery 对象由如下语句完成：
TermQuery termQuery = new TermQuery(new Term("fieldName","queryWord")); 它的构造函数只接受一个参数，那就是一个 Term 对象。

5、Hits
Hits 是用来保存搜索的结果的。

package TestLucene; 
 import java.io.File; 
 import org.apache.lucene.document.Document; 
 import org.apache.lucene.index.Term; 
 import org.apache.lucene.search.Hits; 
 import org.apache.lucene.search.IndexSearcher; 
 import org.apache.lucene.search.TermQuery; 
 import org.apache.lucene.store.FSDirectory; 

 /**
  * Searches the index for documents whose "contents" field contains a fixed
  * keyword and prints the stored "path" of every hit (legacy Lucene API).
  */
 public class TxtFileSearcher { 

     /**
      * Runs a single term query for the word "lucene" against the hard-coded index.
      *
      * @param args unused
      * @throws Exception if the index cannot be opened or searched
      */
     public static void main(String[] args) throws Exception{ 
        String queryStr = "lucene"; 

        File indexDir = new File("D:\\luceneIndex"); 
        // Check for the index BEFORE opening it; opening a missing directory
        // would fail first and this message would never be shown.
        if(!indexDir.exists()){ 
             System.out.println("The Lucene index is not exist"); 
             return; 
        } 

        FSDirectory directory = FSDirectory.getDirectory(indexDir,false); 
        try { 
             IndexSearcher searcher = new IndexSearcher(directory); 
             try { 
                  // The term is lowercased before lookup; presumably the analyzer
                  // used at index time lowercases tokens too — confirm.
                  Term term = new Term("contents",queryStr.toLowerCase()); 
                  TermQuery luceneQuery = new TermQuery(term); 
                  Hits hits = searcher.search(luceneQuery); 
                  for(int i = 0; i < hits.length(); i++){ 
                       Document document = hits.doc(i); 
                       System.out.println("File: " + document.get("path")); 
                  } 
             } finally { 
                  searcher.close(); 
             } 
        } finally { 
             directory.close(); 
        } 
     } 
 }

package org.algorithm;

import java.io.File;
import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Demonstrates searching a Lucene 4.0 index with a single-term query.
 */
public class IndexSearcherDemo {

    /**
     * Searches the "content" field of the index at {@code indexPath} for
     * {@code term} and prints the number of hits followed by one line per hit.
     *
     * <p>Steps: open the index, run the term query (top 20 results), print
     * each result, and release the reader and directory.
     *
     * @param indexPath file-system directory that holds the index
     * @param term      exact term to look up in the "content" field
     * @throws IOException if the index cannot be opened or read
     */
    public static void searchIndex(String indexPath,String term) throws IOException{
         // Open the index location.
         Directory directory = FSDirectory.open(new File(indexPath));
         try{
             IndexReader reader = DirectoryReader.open(directory);
             try{
                 IndexSearcher search = new IndexSearcher(reader);

                 // Search by keyword; at most 20 hits are returned.
                 TopDocs docs = search.search(new TermQuery(new Term("content",term)),20);

                 // Walk the results.
                 ScoreDoc[] hits = docs.scoreDocs;
                 System.out.println(hits.length);
                 for(ScoreDoc hit:hits){
                     // NOTE(review): hit.doc is Lucene's internal document number, not
                     // a file name; the ".txt" suffix is kept from the original output.
                     System.out.println("doc:"+hit.doc+".txt"+"   "+"score:"+hit.score);
                 }
             }finally{
                 // Release the reader even when the search throws.
                 reader.close();
             }
         }finally{
             directory.close();
         }
    }

    /**
     * Searches the hard-coded index for the word "you".
     *
     * @param args unused
     * @throws IOException if the search fails
     */
    public static void main(String[] args) throws IOException {
        String indexPath = "E:/luceneindextxt";
        searchIndex(indexPath,"you");
    }
}