A Simple File Indexer Based on Lucene

1. Where Lucene applies
Full-text retrieval, website search, database search.


2. Lucene components
【1】A Document object represents a document to be indexed; an IndexWriter adds Documents to the index.
【2】When a user issues a query, an IndexSearcher searches the Lucene index, computing term weights and scores, and then returns the ranked results.

3. API usage:
  • Building the index
  1. Create an IndexWriter to write the index. Its parameters include: 1) INDEX_DIR, the directory where the index files live; 2) an Analyzer that performs lexical analysis on the documents.
  2. Create the document to be indexed: create a new Document, add its Fields, then call IndexWriter.addDocument.
  • Searching
  1. Create a QueryParser, parse the query string, and put the resulting query tree into a Query object.
  2. Call IndexSearcher.search on that query tree to get the results (a minimal search sketch follows the Indexer class below).

package com.triple.one.indexer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

/**
 * Func: builds a Lucene index over the text files in a directory.
 * Created by tripleone on 18/2/28.
 */
public class Indexer {


    private IndexWriter writer;

    public Indexer(String indexDir) throws IOException{
        Directory dir = FSDirectory.open(new File(indexDir));

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

        writer = new IndexWriter(dir, analyzer,true,
                IndexWriter.MaxFieldLength.UNLIMITED);

    }

    public void close() throws IOException{
        writer.close();
    }

    /**
     * Index all matching files under a directory.
     * @param dataDir directory containing the files to be indexed
     * @param filter  optional filter deciding which files to index
     * @return the number of documents now in the index
     * @throws IOException
     */
    public int index(String dataDir, FileFilter filter)
        throws IOException{
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // dataDir does not exist or is not a directory
            return writer.numDocs();
        }

        for (File f: files){
            if(!f.isDirectory()
                    && !f.isHidden()
                    && f.exists()
                    && f.canRead()
                    && (filter == null || filter.accept(f))){
                try {
                    indexFile(f);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return writer.numDocs();
    }

    private static class TextFilesFilter implements FileFilter{

        public boolean accept(File path){
            // only index files whose names end in .txt
            return path.getName().toLowerCase().endsWith(".txt");
        }
    }

    protected Document getDocument(File f) throws Exception{
        Document doc = new Document();
        // file contents: tokenized from a Reader, not stored
        doc.add(new Field("文档", new FileReader(f)));
        // file name: stored as-is, not analyzed
        doc.add(new Field("文件名", f.getName(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        // canonical path: stored as-is, not analyzed
        doc.add(new Field("路径", f.getCanonicalPath(),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        return doc;
    }
    private void indexFile(File f)throws Exception{
        System.out.println("Indexing "+f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);
    }



    public static void main(String[] args) throws IOException {
        System.out.println(System.getProperty("user.dir"));

        String indexDir = System.getProperty("user.dir") + "/test1"; // where the index is written
        String dataDir = System.getProperty("user.dir") + "/test2";  // documents under this directory are indexed

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed = 0;

        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());

        } catch (IOException e) {
            e.printStackTrace();
        }finally {
            indexer.close();
        }

        long end = System.currentTimeMillis();
        System.out.println("搜索到 " + numIndexed + " 文件夹共花费 "
                + (end - start) + " 毫秒" );

    }
}
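
Below is a minimal sketch of the search side described in step 3 above. It assumes the index built by Indexer and the same Lucene 3.x API; the class name Searcher, the query string, and the top-10 result limit are illustrative, not part of the original project.

package com.triple.one.indexer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;

/**
 * Func: searches the index built by Indexer (illustrative sketch).
 */
public class Searcher {

    public static void search(String indexDir, String q) throws Exception {
        IndexSearcher searcher =
                new IndexSearcher(FSDirectory.open(new File(indexDir)));

        // parse the query string into a query tree; "文档" is the default field
        QueryParser parser = new QueryParser(Version.LUCENE_30, "文档",
                new StandardAnalyzer(Version.LUCENE_30));
        Query query = parser.parse(q);

        // search with the query tree and print the score and stored path of each hit
        TopDocs hits = searcher.search(query, 10);
        for (ScoreDoc sd : hits.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            System.out.println(sd.score + "  " + doc.get("路径"));
        }
        searcher.close();
    }

    public static void main(String[] args) throws Exception {
        String indexDir = System.getProperty("user.dir") + "/test1";
        search(indexDir, "lucene");  // example query string
    }
}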

Dependencies:

 <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>3.3.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers</artifactId>
      <version>3.3.0</version>
    </dependency>


    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>3.3.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queries</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queries</artifactId>
      <version>3.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-highlighter</artifactId>
      <version>3.3.0</version>
    </dependency>
 


The test2 directory contains 6 files ending in .txt.
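
A run of Indexer then prints one "Indexing ..." line per file followed by a summary, roughly like the following (paths and elapsed time are illustrative, not actual output):

Indexing /path/to/test2/file1.txt
Indexing /path/to/test2/file2.txt
... (one line per .txt file, 6 in total)
Indexed 6 files in <elapsed> ms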



