基于Lucene 的一个简单文件索引

最新推荐文章于 2020-05-31 20:58:41 发布

洞之蝉

最新推荐文章于 2020-05-31 20:58:41 发布

阅读量363

点赞数

分类专栏： Java

本文链接：https://blog.csdn.net/muyimo/article/details/79400672

版权

Java 专栏收录该内容

37 篇文章 1 订阅

订阅专栏

  1.lucene适用范围 

  文本检索、网站信息检索、数据库搜索 

  2.lucene的组件 

  【1】.document对象表示被索引的文档，IndexWriter将文档add到index中 

  【2】用户query时，通过indexSearcher搜索lucene的index，同时计算term weight和score，之后返回结果 

  3.API调用方法： 

创建索引

创建 IndexWriter 用于写入 index，包括参数：1）INDEX_DIR，索引文件位置；2）Analyzer，文档词法分析器
创建待索引的文档：new Document，把 Field 添加进去，再调用 indexWriter.addDocument 即可

搜索

创建 QueryParser 对象，解析（parse）查询语句，得到的查询语法树保存在 Query 对象中
用indexSearcher调用search方法搜索语法树，得到结果

package com.triple.one.indexer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Minimal Lucene 3.x file indexer.
 *
 * <p>Adds every readable, non-hidden regular file found directly inside a data
 * directory (optionally restricted by a {@link FileFilter}) to an on-disk index.
 * The underlying {@link IndexWriter} is opened with {@code create = true}, so an
 * index already present in the target directory is overwritten. Call
 * {@link #close()} when done.
 *
 * Created by tripleone on 18/2/28.
 */
public class Indexer {

    /** Name of the field holding the file contents (fed to Lucene through a Reader). */
    private static final String FIELD_CONTENTS = "文档";

    /** Name of the stored, un-analyzed field holding the file name. */
    private static final String FIELD_FILE_NAME = "文件名";

    /** Name of the stored, un-analyzed field holding the canonical file path. */
    private static final String FIELD_PATH = "路径";

    private final IndexWriter writer;

    /**
     * Opens (and recreates) the index stored in {@code indexDir}.
     *
     * @param indexDir file-system directory in which the index is written
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        try {
            writer = new IndexWriter(dir, analyzer, true,
                    IndexWriter.MaxFieldLength.UNLIMITED);
        } catch (IOException e) {
            // Do not leak the directory handle when the writer cannot be created.
            dir.close();
            throw e;
        }
    }

    /**
     * Closes the underlying index writer.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        writer.close();
    }

    /**
     * Indexes the files located directly inside {@code dataDir} (no recursion).
     *
     * <p>Directories, hidden files and unreadable files are skipped. A failure on a
     * single file is reported on standard error and does not stop the run.
     *
     * @param dataDir directory whose files should be indexed
     * @param filter  optional filter; {@code null} accepts every file
     * @return the number of documents in the index as reported by the writer
     * @throws IOException if {@code dataDir} cannot be listed (for example it does not
     *                     exist or is not a directory) or the document count cannot be read
     */
    public int index(String dataDir, FileFilter filter)
            throws IOException {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null rather than throwing when the path is not a
            // listable directory; fail with a clear message instead of an NPE.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }

        for (File f : files) {
            if (!f.isDirectory()
                    && !f.isHidden()
                    && f.exists()
                    && f.canRead()
                    && (filter == null || filter.accept(f))) {
                try {
                    indexFile(f);
                } catch (Exception e) {
                    // Best effort: report the failing file and keep going.
                    System.err.println("Failed to index " + f);
                    e.printStackTrace();
                }
            }
        }
        return writer.numDocs();
    }

    /** Accepts only files whose name ends with {@code .txt} (case-insensitive). */
    private static class TextFilesFilter implements FileFilter {

        @Override
        public boolean accept(File path) {
            // Only .txt files are indexed.
            return path.getName().toLowerCase().endsWith(".txt");
        }
    }

    /**
     * Builds the Lucene document for one file: the contents (read through a
     * {@link FileReader}, i.e. decoded with the platform default charset), the file
     * name and the canonical path.
     *
     * @param f file to describe
     * @return the document to add to the index
     * @throws Exception if the file cannot be opened or its path cannot be resolved
     */
    protected Document getDocument(File f) throws Exception {
        Document doc = new Document();
        doc.add(new Field(FIELD_CONTENTS, new FileReader(f)));
        doc.add(new Field(FIELD_FILE_NAME, f.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(FIELD_PATH, f.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Adds a single file to the index and releases the reader opened for it.
     *
     * @param f file to index
     * @throws Exception if building or adding the document fails
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        try {
            writer.addDocument(doc);
        } finally {
            // Release the file handle even when addDocument fails before the reader
            // has been consumed; closing an already-closed reader has no effect.
            Field contents = doc.getField(FIELD_CONTENTS);
            if (contents != null && contents.readerValue() != null) {
                contents.readerValue().close();
            }
        }
    }

    /**
     * Demo entry point: indexes the {@code .txt} files in {@code <user.dir>/test2}
     * into an index stored in {@code <user.dir>/test1} and prints the elapsed time.
     *
     * @param args unused
     * @throws IOException if the index cannot be opened or closed
     */
    public static void main(String[] args) throws IOException {
        System.out.println(System.getProperty("user.dir"));

        String indexDir = System.getProperty("user.dir") + "/test1"; // where the index is written
        String dataDir = System.getProperty("user.dir") + "/test2";  // files in this directory get indexed

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed = 0;

        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            indexer.close();
        }

        long end = System.currentTimeMillis();
        System.out.println("搜索到 " + numIndexed + " 文件夹共花费 "
                + (end - start) + " 毫秒");
    }
}

依赖包：

 <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>3.3.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>3.3.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
    <!-- Declared once only: Maven warns when the same dependency appears twice. -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queries</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-highlighter</artifactId>
      <version>3.3.0</version>
    </dependency>

test2里共有6个以txt结尾的文件