JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

最新推荐文章于 2022-02-10 14:49:27 发布

u010463032

最新推荐文章于 2022-02-10 14:49:27 发布

阅读量872

点赞数

分类专栏： JavaEE

JavaEE 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

首先在 pom.xml 中引入以下 Maven 依赖（Apache POI 与 PDFBox）：

[html]view plain copy 
   
 <dependency>  
     <groupId>org.apache.poi</groupId>  
     <artifactId>poi</artifactId>  
     <version>3.15</version>  
 </dependency>  
 <dependency>  
     <groupId>org.apache.poi</groupId>  
     <artifactId>poi-ooxml</artifactId>  
     <version>3.15</version>  
 </dependency>  
 <dependency>  
     <groupId>org.apache.poi</groupId>  
     <artifactId>poi-scratchpad</artifactId>  
     <version>3.15</version>  
 </dependency>  
 <dependency>  
     <groupId>org.apache.pdfbox</groupId>  
     <artifactId>pdfbox</artifactId>  
     <version>2.0.4</version>  
 </dependency>  

[java]view plain copy 
   
   

[java]view plain copy 
   
 <pre name="code" class="html">public class ParseText {  
   
     // 判断文档类型，调用不同的解析方法  
     public static String parse(byte[] buffer, String suffix) {  
         String text = "";  
         switch (suffix) {  
         case "doc":  
             text = getTextFromWord(buffer);  
             break;  
         case "docx":  
             text = getTextFromWord2007(buffer);  
             break;  
         case "xls":  
             text = getTextFromExcel(buffer);  
             break;  
         case "xlsx":  
             text = getTextFromExcel2007(buffer);  
             break;  
         case "ppt":  
             text = getTextFromPPT(buffer);  
             break;  
         case "pptx":  
             text = getTextFromPPT2007(buffer);  
             break;  
         case "pdf":  
             text = getTextFormPDF(buffer);  
             break;  
         case "txt":  
             text = getTextFormTxt(buffer);  
             break;  
         default:  
             System.out.println("不支持解析的文档类型");  
         }  
   
         return text.replaceAll("\\s*", "");  
     }  
   
     // 读取Word97-2003的全部内容 doc  
     private static String getTextFromWord(byte[] file) {  
         String text = "";  
         InputStream fis = null;  
         WordExtractor ex = null;  
         try {  
             // word 2003： 图片不会被读取  
             fis = new ByteArrayInputStream(file);  
             ex = new WordExtractor(fis);  
             text = ex.getText();  
             ex.close();  
         } catch (Exception e) {  
             // TODO Auto-generated catch block  
             e.printStackTrace();  
         }  
         return text;  
     }  
   
     // 读取Word2007+的全部内容 docx  
     private static String getTextFromWord2007(byte[] file) {  
         String text = "";  
         InputStream fis = null;  
         XWPFDocument doc = null;  
         XWPFWordExtractor workbook = null;  
         try {  
             fis = new ByteArrayInputStream(file);  
             doc = new XWPFDocument(fis);  
             workbook = new XWPFWordExtractor(doc);  
             text = workbook.getText();  
             workbook.close();  
         } catch (IOException e) {  
             // TODO Auto-generated catch block  
             e.printStackTrace();  
         }  
         return text;  
     }  
   
     // 读取Excel97-2003的全部内容 xls  
     private static String getTextFromExcel(byte[] file) {  
         InputStream is = null;  
         HSSFWorkbook wb = null;  
         String text = "";  
         try {  
             is = new ByteArrayInputStream(file);  
             wb = new HSSFWorkbook(new POIFSFileSystem(is));  
             ExcelExtractor extractor = new ExcelExtractor(wb);  
             extractor.setFormulasNotResults(false);  
             extractor.setIncludeSheetNames(false);  
             text = extractor.getText();  
             extractor.close();  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return text;  
     }  
   
     // 读取Excel2007+的全部内容 xlsx  
     private static String getTextFromExcel2007(byte[] file) {  
         InputStream is = null;  
         XSSFWorkbook workBook = null;  
         String text = "";  
         try {  
             is = new ByteArrayInputStream(file);  
             workBook = new XSSFWorkbook(is);  
             XSSFExcelExtractor extractor = new XSSFExcelExtractor(workBook);  
             extractor.setIncludeSheetNames(false);  
             text = extractor.getText();  
             extractor.close();  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return text;  
     }  
   
     // 读取Powerpoint97-2003的全部内容 ppt  
     private static String getTextFromPPT(byte[] file) {  
         String text = "";  
         InputStream fis = null;  
         PowerPointExtractor ex = null;  
         try {  
             // word 2003： 图片不会被读取  
             fis = new ByteArrayInputStream(file);  
             ex = new PowerPointExtractor(fis);  
             text = ex.getText();  
             ex.close();  
         } catch (Exception e) {  
             // TODO Auto-generated catch block  
             e.printStackTrace();  
         }  
         return text;  
     }  
   
     // 抽取幻灯片2007+全部内容 pptx  
     private static String getTextFromPPT2007(byte[] file) {  
         InputStream is = null;  
         XMLSlideShow slide = null;  
         String text = "";  
         try {  
             is = new ByteArrayInputStream(file);  
             slide = new XMLSlideShow(is);  
             XSLFPowerPointExtractor extractor = new XSLFPowerPointExtractor(slide);  
             text = extractor.getText();  
             extractor.close();  
         } catch (IOException e) {  
             e.printStackTrace();  
         }  
         return text;  
     }  
   
     // 读取pdf文件全部内容 pdf  
     private static String getTextFormPDF(byte[] file) {  
         String text = "";  
         PDDocument pdfdoc = null;  
         InputStream is = null;  
         try {  
             is = new ByteArrayInputStream(file);  
             pdfdoc = PDDocument.load(is);  
             PDFTextStripper stripper = new PDFTextStripper();  
             text = stripper.getText(pdfdoc);  
   
         } catch (IOException e) {  
             e.printStackTrace();  
         } finally {  
             try {  
                 if (pdfdoc != null) {  
                     pdfdoc.close();  
                 }  
             } catch (IOException e) {  
                 // TODO Auto-generated catch block  
                 e.printStackTrace();  
             }  
         }  
         return text;  
     }  
   
     // 读取txt文件全部内容 txt  
     private static String getTextFormTxt(byte[] file) {  
         String text = "";  
         try {  
             String encoding = get_charset(file);  
             text = new String(file, encoding);  
         } catch (UnsupportedEncodingException e) {  
             e.printStackTrace();  
         } catch (IOException e1) {  
             e1.printStackTrace();  
         }  
         return text;  
     }  
   
     // 获得txt文件编码方式  
     private static String get_charset(byte[] file) throws IOException {  
         String charset = "GBK";  
         byte[] first3Bytes = new byte[3];  
         InputStream bis = null;  
         try {  
             boolean checked = false;  
             bis = new ByteArrayInputStream(file);  
             bis.mark(0);  
             int read = bis.read(first3Bytes, 0, 3);  
             if (read == -1)  
                 return charset;  
             if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {  
                 charset = "UTF-16LE";  
                 checked = true;  
             } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {  
                 charset = "UTF-16BE";  
                 checked = true;  
             } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB  
                     && first3Bytes[2] == (byte) 0xBF) {  
                 charset = "UTF-8";  
                 checked = true;  
             }  
             bis.reset();  
             if (!checked) {  
                 while ((read = bis.read()) != -1) {  
                     if (read >= 0xF0)  
                         break;  
                     if (0x80 <= read && read <= 0xBF) // 单独出现BF以下的，也算是GBK  
                         break;  
                     if (0xC0 <= read && read <= 0xDF) {  
                         read = bis.read();  
                         if (0x80 <= read && read <= 0xBF) // 双字节 (0xC0 - 0xDF)  
                             // (0x80 - 0xBF),也可能在GB编码内  
                             continue;  
                         else  
                             break;  
                     } else if (0xE0 <= read && read <= 0xEF) {// 也有可能出错，但是几率较小  
                         read = bis.read();  
                         if (0x80 <= read && read <= 0xBF) {  
                             read = bis.read();  
                             if (0x80 <= read && read <= 0xBF) {  
                                 charset = "UTF-8";  
                                 break;  
                             } else  
                                 break;  
                         } else  
                             break;  
                     }  
                 }  
             }  
         } catch (Exception e) {  
             e.printStackTrace();  
         } finally {  
             if (bis != null) {  
                 bis.close();  
             }  
         }  
         return charset;  
     }  
 }</pre><br>  
 <br>  
 <p></p>  
 <pre></pre>  
 <br>  
 <p></p>  
 <div style="top:628px"><pre name="code" class="java">   // 读取pdf文件  
     private static String getTextFormPDF(byte[] file) {  
         String text = "";  
         PDDocument pdfdoc = null;  
         InputStream is = null;  
         try {  
             is = new ByteArrayInputStream(file);  
             pdfdoc = PDDocument.load(is);  
             PDFTextStripper stripper = new PDFTextStripper();  
             text = stripper.getText(pdfdoc);  
   
         } catch (IOException e) {  
             e.printStackTrace();  
         } finally {  
             try {  
                 if (pdfdoc != null) {  
                     pdfdoc.close();  
                 }  
             } catch (IOException e) {  
                 // TODO Auto-generated catch block  
                 e.printStackTrace();  
             }  
         }  
         return text;  
     }  
   
     // 读取txt文件  
     private static String getTextFormTxt(byte[] file) {  
         String text = "";  
         try {  
             String encoding = get_charset(file);  
             text = new String(file, encoding);  
         } catch (UnsupportedEncodingException e) {  
             e.printStackTrace();  
         } catch (IOException e1) {  
             e1.printStackTrace();  
         }  
         return text;  
     }</pre></div>  

u010463032

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
JAVA提取Word,Excel,PPT,PDF,TXT等文档文字内容

首先引入Maven库 [html] view plain copy <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> <version>3.15</version> </dependency> <dependency> …
复制链接

扫一扫

专栏目录