lucene全文检索

最新推荐文章于 2024-07-25 18:31:33 发布
jc橙子
最新推荐文章于 2024-07-25 18:31:33 发布
阅读量305
点赞数
分类专栏： lucene-java 文章标签： lucene 全文检索 excel word pdf
本文链接：https://blog.csdn.net/sinat_36795605/article/details/66970829
版权
lucene-java 专栏收录该内容
1 篇文章 0 订阅
订阅专栏
lucene全文检索

适用于 txt、Word、PPT、Excel、PDF 等文档
代码块Indexer

lucene索引代码：
package com.fzky.diams.web.luncene;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Builds a Lucene index over text, Word, Excel, PowerPoint and PDF files.
 *
 * <p>Every indexed file becomes one document with three stored text fields:
 * {@code contents}, {@code fileName} and {@code fullPath}. The index is opened in
 * {@link OpenMode#CREATE} mode, so constructing an {@code Indexer} replaces whatever
 * index already exists in the target directory.
 *
 * <p>Text extraction is delegated to the project class {@code ReaderFile}.
 *
 * <p>Typical use: {@code Indexer.index(indexDir, dataDir)}.
 */
public class Indexer implements AutoCloseable {
    /**
     * Historical accumulator of the recursive directory walk. The walk now collects
     * into a per-call list, so this field is never written; it is kept only so that
     * code in this package that still references it keeps compiling.
     */
    static List<String> list = new ArrayList<String>();

    /** Directory holding the index; closed together with the writer. */
    private Directory directory;

    /** Writer used to add documents to the index. */
    private IndexWriter writer;

    /**
     * Opens (and overwrites) the index stored in {@code indexDir}.
     *
     * @param indexDir path of the directory the index is written to
     * @throws Exception if the directory or the index writer cannot be opened
     */
    public Indexer(String indexDir) throws Exception {
        Directory dir = FSDirectory.open(Paths.get(indexDir));
        try {
            // IK analyzer (project class) is used for tokenization.
            Analyzer analyzer = new MyIkAnalyzer();
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            // CREATE discards any previous index instead of appending to it.
            config.setOpenMode(OpenMode.CREATE);
            writer = new IndexWriter(dir, config);
        } catch (Exception e) {
            // Do not leak the directory handle when the writer cannot be created.
            try {
                dir.close();
            } catch (IOException closeError) {
                e.addSuppressed(closeError);
            }
            throw e;
        }
        directory = dir;
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws Exception if closing the writer or the directory fails
     */
    @Override
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes every entry directly inside {@code dataDir} (no recursion).
     *
     * @param dataDir directory whose entries are indexed
     * @return the number of documents in the index afterwards
     * @throws IOException if {@code dataDir} is not a directory or cannot be listed
     * @throws Exception if reading or indexing any entry fails
     */
    public int indexAll(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null for a non-directory or on an I/O error.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }
        for (File file : files) {
            indexFile(file);
        }
        return writer.numDocs();
    }

    /**
     * Adds a single file to the index and logs progress to standard output.
     *
     * @param file the file to index
     * @throws Exception if the file cannot be read or added to the index
     */
    private void indexFile(File file) throws Exception {
        System.out.println("索引文件的路径：" + file.getCanonicalPath());
        Document doc = getDocument(file);
        writer.addDocument(doc);
        System.out.println("索引数：：："+writer.numDocs());
    }

    /**
     * Builds the Lucene document for a file. A document is comparable to one row in
     * a database table, with each field being a column.
     *
     * @param file the file to convert
     * @return a document holding the extracted text, the file name and the canonical path
     * @throws Exception if text extraction or path resolution fails
     */
    private Document getDocument(File file) throws Exception {
        Document doc = new Document();
        doc.add(new Field("contents", extractText(file), TextField.TYPE_STORED));
        doc.add(new Field("fileName", file.getName(), TextField.TYPE_STORED));
        doc.add(new Field("fullPath", file.getCanonicalPath(), TextField.TYPE_STORED));
        return doc;
    }

    /**
     * Extracts the text of a file, choosing the reader by file extension.
     * Files with an unsupported (or no) extension yield an empty string, so they are
     * still indexed by name and path.
     *
     * @param file the file to read
     * @return the extracted text, never {@code null}
     * @throws Exception if the underlying reader fails
     */
    private static String extractText(File file) throws Exception {
        String path = file.getAbsolutePath();
        ReaderFile extractor = new ReaderFile();
        String text;
        switch (extensionOf(file)) {
        case "txt":
            text = extractor.readFile(path);
            break;
        case "doc":
            text = extractor.readWord(path);
            break;
        case "docx":
            text = extractor.readWordDocx(path);
            break;
        case "xlsx":
            text = extractor.getTextFromExcel2007(path);
            break;
        case "xls":
            text = extractor.getTextFromExcel(path);
            break;
        case "pdf":
            text = extractor.readPdf(path);
            break;
        case "ppt":
            text = extractor.getTextFromPPT(path);
            break;
        case "pptx":
            text = extractor.getTextFromPPT2007(path);
            break;
        default:
            text = "";
            break;
        }
        // Field rejects null values; treat "nothing extracted" as empty content.
        // NOTE(review): whether ReaderFile can return null is not visible here.
        return text == null ? "" : text;
    }

    /**
     * Returns the lower-cased extension of a file name, or an empty string when the
     * name contains no dot.
     *
     * @param file the file whose name is inspected
     * @return the extension without the leading dot, in lower case
     */
    private static String extensionOf(File file) {
        String name = file.getName();
        int dot = name.lastIndexOf('.');
        if (dot < 0) {
            return "";
        }
        return name.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * Collects the absolute paths of all readable regular files below {@code datadir}.
     *
     * @param datadir root file or directory of the walk
     * @return a new list of absolute file paths; empty when nothing is readable
     * @throws IOException declared for compatibility with existing callers
     */
    private static List<String> indexDocsNew(String datadir) throws IOException {
        List<String> found = new ArrayList<String>();
        indexDocs(new File(datadir), found);
        return found;
    }

    /**
     * Walks {@code file} depth-first and appends every readable regular file to
     * {@code found}. Unreadable entries and directories that cannot be listed are
     * skipped silently.
     *
     * @param file  file or directory to visit
     * @param found accumulator receiving absolute file paths
     * @throws IOException declared for compatibility with existing callers
     */
    private static void indexDocs(File file, List<String> found) throws IOException {
        if (!file.canRead()) {
            return;
        }
        if (!file.isDirectory()) {
            // Leaf: a regular file that should be indexed.
            found.add(file.getAbsolutePath());
            return;
        }
        String[] children = file.list();
        if (children == null) {
            return;
        }
        for (String child : children) {
            indexDocs(new File(file, child), found);
        }
    }

    /**
     * Rebuilds the index in {@code indexdir} from all files found recursively under
     * {@code datadir}, and prints the elapsed time.
     *
     * <p>Indexing errors are printed and stop the run; they are not rethrown.
     *
     * @param indexdir directory the index is written to
     * @param datadir  root directory of the files to index
     * @throws IOException if collecting the files fails
     */
    public static void index(String indexdir, String datadir) throws IOException {
        List<String> files = indexDocsNew(datadir);
        long startTime = System.currentTimeMillis();
        // try-with-resources closes the indexer only if it was actually created.
        try (Indexer indexer = new Indexer(indexdir)) {
            for (String path : files) {
                indexer.indexFile(new File(path));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("索引耗时" + (endTime-startTime) + "毫秒");
    }
}


### 代码块Searcher
lucene搜索代码：
```java
package com.fzky.diams.web.luncene;

import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Runs full-text queries against an index whose documents carry the fields
 * {@code contents} (searched) and {@code fullPath} (returned).
 *
 * <p>Typical use: {@code Searcher.search(indexDir, "keyword")}.
 */
public class Searcher {

    /**
     * Searches the {@code contents} field and returns the paths of the best matches.
     *
     * <p>At most 10 hits are returned. The elapsed search time, the total hit count
     * and each matching path are also printed to standard output.
     *
     * @param indexDir directory that holds the Lucene index
     * @param q        query string in Lucene classic query-parser syntax
     * @return the stored {@code fullPath} value of each hit, best match first
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static List<String> search(String indexDir, String q) throws Exception {
        List<String> paths = new ArrayList<String>();
        // Both handles are closed even when parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
             IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // IK analyzer (project class) tokenizes the query text.
            Analyzer analyzer = new MyIkAnalyzer();
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q);

            long startTime = System.currentTimeMillis(); // search start
            TopDocs docs = searcher.search(query, 10);   // top 10 hits only
            long endTime = System.currentTimeMillis();   // search end

            System.out.println("匹配" + q + "共耗时" + (endTime-startTime) + "毫秒");
            System.out.println("查询到" + docs.totalHits + "条记录");

            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                // scoreDoc.doc is the document id used to load the stored fields.
                Document doc = searcher.doc(scoreDoc.doc);
                String fullPath = doc.get("fullPath");
                System.out.println(fullPath);
                paths.add(fullPath);
            }
        }
        return paths;
    }
}
```

jc橙子
关注
0
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
lucene全文检索

lucene全文检索适用于word,ppt ,excel,pdf代码块Indexerlucene检索代码：package com.fzky.diams.web.luncene;import java.io.File;import java.io.IOException;import java.nio.file.Paths;import java.util.ArrayList;impo
复制链接

扫一扫