lucene pdf+doc+ppt+xls+txt+多层文件

最新推荐文章于 2019-09-19 19:23:54 发布

allenshi_szl

最新推荐文章于 2019-09-19 19:23:54 发布

阅读量2k

点赞数

分类专栏： Nutch & Lucene 文章标签： lucene exception string path import file

Nutch & Lucene 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

lucene pdf+doc+ppt+xls+txt+多层文件

2009-09-21 20:33

/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/

package stringtest;

import java.io.*;
import java.io.FileInputStream;
import java.io.File;

import org.apache.poi.hssf.extractor.*;
import org.apache.poi.hssf.usermodel.*;//包含生成Excel文档的各个类.

import org.apache.poi.hwpf.extractor.*;//对word文档进行处理的包

import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.usermodel.SlideShow;//对ppt文档进行处理的包

import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;//对pdf文档进行处理的包

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.queryParser.*;//lucene包

/**
*
* @author hp
*/
public class StringTest {
// Root directory of the files to index; main passes it to writeToIndex.
public static String INDEX_FILE_PATH = "F://index2";
// Directory holding the Lucene index; used by main (IndexWriter) and indexSearcher (IndexSearcher).
public static String INDEX_STORE_PATH ="F://store2";

    /**
     * Indexes every supported file found under {@link #INDEX_FILE_PATH} into the
     * index at {@link #INDEX_STORE_PATH}, then runs one sample query against it.
     *
     * @param args the command line arguments (unused)
     * @throws Exception if indexing or searching fails
     */
    public static void main(String[] args)throws Exception {
        StringTest test = new StringTest();
        // The 'true' flag asks Lucene to create the index, replacing any existing one.
        IndexWriter writer = new IndexWriter(INDEX_STORE_PATH,new StandardAnalyzer(),true);
        try {
            test.writeToIndex(INDEX_FILE_PATH,writer);
        } finally {
            // Close the writer even when indexing fails part-way, so it is never left open.
            writer.close();
        }
        test.indexSearcher("雒琛");
    }

    /**
     * Recursively indexes every supported file under {@code path}.
     * <p>
     * Supported types are chosen by file extension (case-insensitive): {@code xls},
     * {@code doc}, {@code ppt}, {@code pdf} and {@code txt}. Files with any other
     * extension, or none, are skipped. Sub-directories are descended into regardless
     * of whether their names contain a dot. The index is optimized once, after the
     * whole tree has been added.
     *
     * @param path   directory to index
     * @param writer open index writer that receives the documents; not closed here
     * @throws IllegalArgumentException if {@code path} is not an existing directory
     * @throws Exception if a file cannot be read or added to the index
     */
    public void writeToIndex(String path,IndexWriter writer)throws Exception{
        File root = new File(path);
        if (!root.isDirectory()) {
            throw new IllegalArgumentException("Not a directory: " + path);
        }
        indexDirectory(root, writer);
        // Optimizing after every single document repeats the merge work each time;
        // doing it once at the end leaves the index in the same optimized state.
        writer.optimize();
    }

    /** Indexes the supported files directly inside {@code folder} and recurses into its sub-directories. */
    private void indexDirectory(File folder, IndexWriter writer) throws Exception {
        File[] entries = folder.listFiles();
        if (entries == null) {
            // listFiles() returns null when the directory cannot be read; skip it.
            return;
        }
        for (int i = 0; i < entries.length; i++) {
            File entry = entries[i];
            if (entry.isDirectory()) {
                indexDirectory(entry, writer);
                continue;
            }
            String absolutePath = entry.getAbsolutePath();
            // Look at the file name only, so dots in parent directories are ignored.
            String extension = extensionOf(entry.getName());
            if (extension.equalsIgnoreCase("xls")) {
                writer.addDocument(readExcel(absolutePath));
            } else if (extension.equalsIgnoreCase("doc")) {
                writer.addDocument(readDoc(absolutePath));
            } else if (extension.equalsIgnoreCase("ppt")) {
                writer.addDocument(readPpt(absolutePath));
            } else if (extension.equalsIgnoreCase("pdf")) {
                writer.addDocument(readPdf(absolutePath));
            } else if (extension.equalsIgnoreCase("txt")) {
                indexTextFile(entry, absolutePath, writer);
            }
            // Anything else is not a supported type and is left out of the index.
        }
    }

    /** Returns the text after the last dot of {@code fileName}, or an empty string if there is no dot. */
    private static String extensionOf(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? "" : fileName.substring(dot + 1);
    }

    /** Adds a plain-text file to the index, streaming its content through a reader that is closed afterwards. */
    private void indexTextFile(File file, String absolutePath, IndexWriter writer) throws Exception {
        // NOTE(review): the file is decoded with the platform default charset, as before.
        Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        try {
            Document doc = new Document();
            doc.add(new Field("content",reader));
            doc.add(new Field("path",absolutePath,Field.Store.YES,Field.Index.ANALYZED));
            // A reader-valued field is consumed while the document is being added,
            // so the reader must stay open until addDocument returns.
            writer.addDocument(doc);
        } finally {
            reader.close();
        }
    }

   /**
    * Parses {@code s} as a query on the "content" field, searches the index at
    * {@link #INDEX_STORE_PATH}, and prints the stored "path" value of every hit
    * to standard output, one per line.
    *
    * @param s query text, parsed by Lucene's {@code QueryParser}
    * @throws Exception if the query cannot be parsed or the index cannot be read
    */
   public void indexSearcher(String s)throws Exception{
       QueryParser parser = new QueryParser("content",new StandardAnalyzer());
       Query query = parser.parse(s);
       Searcher searcher = new IndexSearcher(INDEX_STORE_PATH);
       try {
           Hits hits = searcher.search(query);
           for(int i=0;i<hits.length();i++){
               Document d = hits.doc(i);
               System.out.println(d.get("path")+" ");
           }
       } finally {
           // The searcher keeps the index open; close it whether or not the search succeeds.
           searcher.close();
       }
   }

   /**
    * Extracts the text of an Excel (.xls) workbook and wraps it in a Lucene document.
    *
    * @param xls path of the workbook; also stored in the "path" field
    * @return a document with the extracted text in "content" and the path in "path"
    * @throws Exception if the file cannot be opened or parsed
    */
   public Document readExcel(String xls)throws Exception {
       String text;
       InputStream in = new FileInputStream(xls);
       try {
           ExcelExtractor extractor = new ExcelExtractor(new HSSFWorkbook(in));
           // Emit formula text instead of computed results, and omit sheet names.
           extractor.setFormulasNotResults(true);
           extractor.setIncludeSheetNames(false);
           text = extractor.getText();
       } finally {
           // Close the file handle whether or not extraction succeeded.
           in.close();
       }

       Document docexcel = new Document();
       docexcel.add(new Field("content",text,Field.Store.YES,Field.Index.ANALYZED));
       docexcel.add(new Field("path",xls,Field.Store.YES,Field.Index.ANALYZED));
       return docexcel;
   }

    /**
     * Extracts the text of a Word (.doc) file and wraps it in a Lucene document.
     *
     * @param doc path of the Word file; also stored in the "path" field
     * @return a document with the extracted text in "content" and the path in "path"
     * @throws Exception if the file cannot be opened or parsed
     */
    public Document readDoc(String doc) throws Exception {
        String text;
        FileInputStream in = new FileInputStream(doc);
        try {
            WordExtractor extractor = new WordExtractor(in);
            text = extractor.getText();
        } finally {
            // Close the file handle whether or not extraction succeeded.
            in.close();
        }

        Document docdoc = new Document();
        docdoc.add(new Field("content",text,Field.Store.YES,Field.Index.ANALYZED));
        docdoc.add(new Field("path",doc,Field.Store.YES,Field.Index.ANALYZED));
        return docdoc;
    }

    /**
     * Extracts the text of every slide of a PowerPoint (.ppt) file and wraps it in
     * a Lucene document.
     * <p>
     * Each text run is followed by a newline so that the last word of one run and
     * the first word of the next are not fused into a single token when analyzed.
     *
     * @param ppt path of the presentation; also stored in the "path" field
     * @return a document with the extracted text in "content" and the path in "path"
     * @throws Exception if the file cannot be opened or parsed
     */
    public Document readPpt(String ppt)throws Exception {
        StringBuilder text = new StringBuilder();
        FileInputStream is = new FileInputStream(ppt);
        try {
            SlideShow ss = new SlideShow(new HSLFSlideShow(is));
            Slide[] slides = ss.getSlides();
            for(int i=0;i<slides.length;i++){
                TextRun[] runs = slides[i].getTextRuns();
                for(int j=0;j<runs.length;j++){
                    text.append(runs[j].getText()).append('\n');
                }
            }
        } finally {
            // Close the file handle whether or not extraction succeeded.
            is.close();
        }

        Document docppt = new Document();
        docppt.add(new Field("content",text.toString(),Field.Store.YES,Field.Index.ANALYZED));
        docppt.add(new Field("path",ppt,Field.Store.YES,Field.Index.ANALYZED));
        return docppt;
    }

     /**
      * Extracts the text of a PDF file and wraps it in a Lucene document.
      * <p>
      * Extraction is best-effort: an exception raised while opening, parsing or
      * stripping the file is printed with {@code printStackTrace()} and the document
      * is still returned, with an empty "content" field if no text was obtained.
      *
      * @param pdf path of the PDF file; also stored in the "path" field
      * @return a document with the extracted text in "content" and the path in "path"
      */
     public Document readPdf(String pdf){
         String extracted = "";
         FileInputStream input = null;
         PDDocument pdDocument = null;
         try {
             input = new FileInputStream(pdf);
             PDFParser pdfParser = new PDFParser(input);
             pdfParser.parse();
             pdDocument = pdfParser.getPDDocument();
             extracted = new PDFTextStripper().getText(pdDocument);
         } catch (Exception e) {
             e.printStackTrace();
         } finally {
             // Close the raw stream first, then the parsed document; a failure to
             // close either one is printed and otherwise ignored.
             if (input != null) {
                 try {
                     input.close();
                 } catch (Exception e) {
                     e.printStackTrace();
                 }
             }
             if (pdDocument != null) {
                 try {
                     pdDocument.close();
                 } catch (Exception e) {
                     e.printStackTrace();
                 }
             }
         }

         Document docpdf = new Document();
         docpdf.add(new Field("content",extracted,Field.Store.YES,Field.Index.ANALYZED));
         docpdf.add(new Field("path",pdf,Field.Store.YES,Field.Index.ANALYZED));
         return docpdf;
     }