POI pdf ppt word excel

最新推荐文章于 2022-06-15 12:21:48 发布

蛰伏神兽

最新推荐文章于 2022-06-15 12:21:48 发布

阅读量1.3k

点赞数

Apache POI 专栏收录该内容

1 篇文章 0 订阅

订阅专栏

原地址：http://topinking.iteye.com/blog/225178

关键字: word, excel, powerpoint, pdf, pdfbox

OFFICE文档使用 POI控件，PDF可以使用PDFBOX0.7.3控件，完全支持中文，用XPDF也行，不过感觉PDFBOX比较好，而且作者也在更新。水平有限，万望各位指正

WORD:

      Java代码   
      
    
 import org.apache.lucene.document.Document;  
 import org.apache.lucene.document.Field;  
 import org.apache.poi.hwpf.extractor.WordExtractor;  
   
 import java.io.File;  
 import java.io.InputStream;  
 import java.io.FileInputStream;  
   
 import com.search.code.Index;  
   
 public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {  
   
   String bodyText = null;  
   try {  
    WordExtractor ex = new WordExtractor(is);//is是WORD文件的InputStream   
    bodyText = ex.getText();  
    if(!bodyText.equals("")){  
     index.AddIndex(url, title, bodyText);  
    }  
   }catch (DocCenterException e) {  
    throw new DocCenterException("无法从该Mocriosoft Word文档中提取内容", e);  
   }catch(Exception e){  
    e.printStackTrace();  
   }  
 }  
   return null;  
  }  

Excel:

      Java代码   
      
    
 import org.apache.lucene.document.Document;  
 import org.apache.lucene.document.Field;  
   
 import org.apache.poi.hwpf.extractor.WordExtractor;  
 import  org.apache.poi.hssf.usermodel.HSSFWorkbook;   
 import  org.apache.poi.hssf.usermodel.HSSFSheet;   
 import  org.apache.poi.hssf.usermodel.HSSFRow;   
 import  org.apache.poi.hssf.usermodel.HSSFCell;   
   
 import java.io.File;  
 import java.io.InputStream;  
 import java.io.FileInputStream;  
   
 import com.search.code.Index;  
   
    
   
 public Document getDocument(Index index, String url, String title, InputStream is) throws DocCenterException {  
   StringBuffer content = new StringBuffer();  
   try{  
    HSSFWorkbook  workbook  =  new  HSSFWorkbook(is);//创建对Excel工作簿文件的引用   
    for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {  
     if (null != workbook.getSheetAt(numSheets)) {  
      HSSFSheet aSheet = workbook.getSheetAt(numSheets);//获得一个sheet  
         for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {  
          if (null != aSheet.getRow(rowNumOfSheet)) {  
           HSSFRow aRow = aSheet.getRow(rowNumOfSheet); //获得一个行  
           for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {  
            if (null != aRow.getCell(cellNumOfRow)) {  
             HSSFCell aCell = aRow.getCell(cellNumOfRow);//获得列值  
             content.append(aCell.getStringCellValue());  
            }  
           }  
          }  
         }  
     }  
    }  
    if(!content.equals("")){  
     index.AddIndex(url, title, content.toString());  
    }  
   }catch (DocCenterException e) {  
   
    throw new DocCenterException("无法从该Mocriosoft Word文档中提取内容", e);  
   }catch(Exception  e)  {   
    System.out.println("已运行xlRead()  :  "  +  e  );   
   }  
   return null;  
  }  

PowerPoint:

     Java代码   
     
   
 import java.io.InputStream;  
   
 import org.apache.lucene.document.Document;  
 import org.apache.poi.hslf.HSLFSlideShow;  
 import org.apache.poi.hslf.model.TextRun;  
 import org.apache.poi.hslf.model.Slide;  
 import org.apache.poi.hslf.usermodel.SlideShow;  
   
  public Document getDocument(Index index, String url, String title, InputStream is)  
  throws DocCenterException {  
   StringBuffer content = new StringBuffer("");  
   try{  
    SlideShow ss = new SlideShow(new HSLFSlideShow(is));//is 为文件的InputStream，建立SlideShow  
    Slide[] slides = ss.getSlides();//获得每一张幻灯片  
    for(int i=0;i     TextRun[] t = slides[i].getTextRuns();//为了取得幻灯片的文字内容，建立TextRun  
     for(int j=0;j      content.append(t[j].getText());//这里会将文字内容加到content中去  
     }  
     content.append(slides[i].getTitle());  
    }  
    index.AddIndex(url, title, content.toString());  
   }catch(Exception ex){  
    System.out.println(ex.toString());  
   }  
   return null;  
  }  
   
    

PDF:

     Java代码   
     
   
 import java.io.InputStream;  
 import java.io.IOException;  
 import org.apache.lucene.document.Document;  
   
 import org.pdfbox.cos.COSDocument;  
 import org.pdfbox.pdfparser.PDFParser;  
 import org.pdfbox.pdmodel.PDDocument;  
 import org.pdfbox.pdmodel.PDDocumentInformation;  
 import org.pdfbox.util.PDFTextStripper;  
   
 import com.search.code.Index;  
   
    
   
  public Document getDocument(Index index, String url, String title, InputStream is)throws DocCenterException {  
     
   COSDocument cosDoc = null;  
   try {  
    cosDoc = parseDocument(is);  
   } catch (IOException e) {  
    closeCOSDocument(cosDoc);  
    throw new DocCenterException("无法处理该PDF文档", e);  
   }  
   if (cosDoc.isEncrypted()) {  
    if (cosDoc != null)  
     closeCOSDocument(cosDoc);  
    throw new DocCenterException("该PDF文档是加密文档，无法处理");  
   }  
   String docText = null;  
   try {  
    PDFTextStripper stripper = new PDFTextStripper();  
    docText = stripper.getText(new PDDocument(cosDoc));  
   } catch (IOException e) {  
    closeCOSDocument(cosDoc);  
    throw new DocCenterException("无法处理该PDF文档", e);  
   }  
   
   PDDocument pdDoc = null;  
   try {  
    pdDoc = new PDDocument(cosDoc);  
    PDDocumentInformation docInfo = pdDoc.getDocumentInformation();  
    if(docInfo.getTitle()!=null && !docInfo.getTitle().equals("")){  
     title = docInfo.getTitle();  
    }  
   
   } catch (Exception e) {  
    closeCOSDocument(cosDoc);  
    closePDDocument(pdDoc);  
    System.err.println("无法取得该PDF文档的元数据" + e.getMessage());  
   } finally {  
    closeCOSDocument(cosDoc);  
    closePDDocument(pdDoc);  
   }  
     
   return null;  
  }  
   
  private static COSDocument parseDocument(InputStream is) throws IOException {  
   PDFParser parser = new PDFParser(is);  
   parser.parse();  
   return parser.getDocument();  
  }  
   
  private void closeCOSDocument(COSDocument cosDoc) {  
   if (cosDoc != null) {  
    try {  
     cosDoc.close();  
    } catch (IOException e) {  
    }  
   }  
  }  
   
  private void closePDDocument(PDDocument pdDoc) {  
   if (pdDoc != null) {  
    try {  
     pdDoc.close();  
    } catch (IOException e) {  
    }  
   }  
  }  

代码复制可能出错，不过代码经过测试，绝对能用，POI为3.0-rc4，PDFBOX为0.7.3

蛰伏神兽

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
POI pdf ppt word excel

原地址：http://topinking.iteye.com/blog/225178关键字: word, excel, powerpoint, pdf, pdfboxOFFICE文档使用POI控件，PDF可以使用PDFBOX0.7.3控件，完全支持中文，用XPDF也行，不过感觉PDFBOX比较好，而且作者也在更新。水平有限，万望各位指正 WORD:
复制链接

扫一扫