POI读取word2007+

最新推荐文章于 2024-07-26 16:03:13 发布
pengpengcoder
最新推荐文章于 2024-07-26 16:03:13 发布
阅读量737
点赞数
分类专栏： java Word 文章标签： Word
本文链接：https://blog.csdn.net/qq1620851849/article/details/79812654
版权
java 同时被 2 个专栏收录
18 篇文章 0 订阅
订阅专栏
Word
1 篇文章 0 订阅
订阅专栏
    很多人使用POI读取word的时候都会这么写：
 
         InputStream inputStream =  
         new  
         FileInputStream( 
         new  
         File( 
         "e://company/test.docx" 
         )); 
        
         XWPFDocument document =  
         new  
         XWPFDocument(inputStream); 
        
         System.out.println( 
         new  
         XWPFWordExtractor(document).getText()); 
        
    但是这种写法其实有不少问题：文本框里面的内容读取不到，换行也不正确。于是我做了如下改进：
 
         /** 
        
         * 处理2007+的WORD 
        
         * @param filePath 文件地址 
        
         * @return word内容 
        
         */ 
        
         private  
         static  
         String read2007(String filePath) { 
        
         InputStream inputStream =  
         null 
         ; 
        
         StringBuffer content =  
         new  
         StringBuffer(); 
        
         try  
         { 
        
         inputStream =  
         new  
         FileInputStream( 
         new  
         File(filePath)); 
        
         XWPFDocument document =  
         new  
         XWPFDocument(inputStream); 
        
         // 读取非表格文本框 
        
         for  
         (XWPFParagraph xwpfParagraph : document.getParagraphs()) { 
        
         for 
         (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { 
        
         content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); 
        
         } 
        
         } 
        
         // 读取表格内文本框 
        
         for 
         (XWPFTable xwpfTable : document.getTables()) { 
        
         for  
         (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { 
        
         for  
         (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { 
        
         for  
         (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { 
        
         for 
         (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { 
        
         content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); 
        
         } 
        
         } 
        
         } 
        
         } 
        
         } 
        
         // 读取表格内容 
        
         for 
         (XWPFTable xwpfTable : document.getTables()) { 
        
         for  
         (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { 
        
         for  
         (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { 
        
         for  
         (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { 
        
         content.append(xwpfParagraph.getText()).append(NEW_LINE); 
        
         } 
        
         } 
        
         } 
        
         } 
        
         return  
         content.toString(); 
        
         }  
         catch  
         (IOException e) { 
        
         logger.error( 
         "解析word错误，文件地址："  
         + filePath, e); 
        
         }  
         finally  
         { 
        
         IOUtils.closeQuietly(inputStream); 
        
         } 
        
         return  
         null 
         ; 
        
         } 
        
         /** 
        
         * 获取XML内容，可以使用递归cursor.getDomNode() 
        
         * @param xml xml 
        
         * @return xml内容 
        
         */ 
        
         private  
         static  
         String getXMLContent(String xml) { 
        
         StringBuffer content =  
         new  
         StringBuffer(); 
        
         Document document; 
        
         try  
         { 
        
         document = DocumentHelper.parseText(xml); 
        
         List<?> namespaces = document.getRootElement().declaredNamespaces();  
         // 判断是否有表格包含文本框 
        
         boolean  
         hasboxintab =  
         false 
         ; 
        
         for  
         (Object object : namespaces) { 
        
         Namespace namespace = (Namespace) object; 
        
         if 
         (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) { 
        
         hasboxintab =  
         true 
         ; 
        
         break 
         ; 
        
         } 
        
         } 
        
         if 
         (!hasboxintab) 
        
         return  
         content.toString(); 
        
         for 
         (Object node : document.selectNodes( 
         "//mc:Fallback//w:p" 
         )) { 
        
         for 
         (Object nodeb : ((Node) node).selectNodes( 
         ".//w:t" 
         )) { 
        
         if 
         (StringUtils.isNotEmpty(((Node) nodeb).getText())) 
        
         content.append(((Node) nodeb).getText()); 
        
         } 
        
         content.append(NEW_LINE); 
        
         } 
        
         }  
         catch  
         (DocumentException e) { 
        
         logger.error( 
         "XML转化错误，内容："  
         + xml, e); 
        
         } 
        
         return  
         content.toString(); 
        
         } 
        
    2003版本简单一些：
 
         /** 
        
         * 处理2003的WORD 
        
         * @param filePath 文件地址 
        
         * @return word内容 
        
         */ 
        
         private  
         static  
         String read2003(String filePath) { 
        
         InputStream inputStream =  
         null 
         ; 
        
         StringBuffer content =  
         new  
         StringBuffer(); 
        
         try  
         { 
        
         inputStream =  
         new  
         FileInputStream( 
         new  
         File(filePath)); 
        
         HWPFDocument document =  
         new  
         HWPFDocument(inputStream); 
        
         String text =  
         null 
         ; 
        
         for  
         ( 
         int  
         i =  
         0 
         ; i < document.getMainTextboxRange().numParagraphs(); i++) {  
         // 文本框 
        
         text = document.getMainTextboxRange().getParagraph(i).text(); 
        
         if 
         (StringUtils.isNotEmpty(text)) 
        
         content.append(text).append(NEW_LINE); 
        
         } 
        
         for  
         ( 
         int  
         i =  
         0 
         ; i < document.getRange().numParagraphs(); i++) {  
         // 非文本框 
        
         text = document.getRange().getParagraph(i).text(); 
        
         if 
         (StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim()))  
         // 注意这里的trim()方法否者会出现乱码 
        
         content.append(text.trim()).append(NEW_LINE); 
        
         } 
        
         return  
         content.toString(); 
        
         }  
         catch  
         (FileNotFoundException e) { 
        
         logger.error( 
         "解析word错误，文件地址："  
         + filePath, e); 
        
         }  
         catch  
         (IOException e) { 
        
         logger.error( 
         "解析word错误，文件地址："  
         + filePath, e); 
        
         }  
         finally  
         { 
        
         IOUtils.closeQuietly(inputStream); 
        
         } 
        
         return  
         null 
         ; 
        
         } 
        
    注意：读取出的内容包括表格里面的内容、文本框内容和直接写在编辑区里面的文本；其他诸如批注、引用等信息可能读取不到，有需要的请自行处理。
    比较完整的代码：
 
         import java.io.File;
         import java.io.FileInputStream;
         import java.io.FileNotFoundException;
         import java.io.IOException;
         import java.io.InputStream;
         import java.util.List;

         import org.apache.commons.io.FilenameUtils;
         import org.apache.commons.io.IOUtils;
         import org.apache.commons.lang.StringUtils;
         import org.apache.log4j.Logger;
         import org.apache.poi.hwpf.HWPFDocument;
         import org.apache.poi.xwpf.usermodel.XWPFDocument;
         import org.apache.poi.xwpf.usermodel.XWPFParagraph;
         import org.apache.poi.xwpf.usermodel.XWPFRun;
         import org.apache.poi.xwpf.usermodel.XWPFTable;
         import org.apache.poi.xwpf.usermodel.XWPFTableCell;
         import org.apache.poi.xwpf.usermodel.XWPFTableRow;
         import org.dom4j.Document;
         import org.dom4j.DocumentException;
         import org.dom4j.DocumentHelper;
         import org.dom4j.Namespace;
         import org.dom4j.Node;
        
         /** 
        
         * WordReaderUtils - WORD 读取 
        
         *  
        
         * @author 500d Team 
        
         * @version 1.0 
        
         */ 
        
         public  
         class  
         WordReaderUtils { 
        
         private  
         static  
         final  
         String WORD_2003 =  
         "doc" 
         ; 
        
         private  
         static  
         final  
         String WORD_2007 =  
         "docx" 
         ; 
        
         private  
         static  
         final  
         Logger logger = Logger.getLogger(WordReaderUtils. 
         class 
         ); 
        
         public  
         static  
         final  
         String NEW_LINE =  
         "\r\n" 
         ; 
        
         public  
         static  
         String read(String filePath) { 
        
         File wordFile = StringUtils.isNotEmpty(filePath) ?  
         new  
         File(filePath) :  
         null 
         ; 
        
         if  
         (wordFile ==  
         null  
         || !wordFile.exists() || !wordFile.isFile()) 
        
         return  
         null 
         ; 
        
         String extension = FilenameUtils.getExtension(filePath); 
        
         if 
         (StringUtils.isEmpty(extension)) 
        
         return  
         null 
         ; 
        
         String content =  
         null 
         ; 
        
         if 
         (WORD_2003.equals(extension.toLowerCase())) 
        
         content = read2003(filePath); 
        
         else  
         if 
         (WORD_2007.equals(extension.toLowerCase())) 
        
         content = read2007(filePath); 
        
         return  
         Crossover.handle(content); 
        
         } 
        
         /** 
        
         * 处理2003的WORD 
        
         * @param filePath 文件地址 
        
         * @return word内容 
        
         */ 
        
         private  
         static  
         String read2003(String filePath) { 
        
         InputStream inputStream =  
         null 
         ; 
        
         StringBuffer content =  
         new  
         StringBuffer(); 
        
         try  
         { 
        
         inputStream =  
         new  
         FileInputStream( 
         new  
         File(filePath)); 
        
         HWPFDocument document =  
         new  
         HWPFDocument(inputStream); 
        
         String text =  
         null 
         ; 
        
         for  
         ( 
         int  
         i =  
         0 
         ; i < document.getMainTextboxRange().numParagraphs(); i++) { 
        
         text = document.getMainTextboxRange().getParagraph(i).text(); 
        
         if 
         (StringUtils.isNotEmpty(text)) 
        
         content.append(text).append(NEW_LINE); 
        
         } 
        
         for  
         ( 
         int  
         i =  
         0 
         ; i < document.getRange().numParagraphs(); i++) { 
        
         text = document.getRange().getParagraph(i).text(); 
        
         if 
         (StringUtils.isNotEmpty(text) && StringUtils.isNotEmpty(text.trim()))  
         // 注意这里的trim()方法否者会出现乱码 
        
         content.append(text.trim()).append(NEW_LINE); 
        
         } 
        
         return  
         content.toString(); 
        
         }  
         catch  
         (FileNotFoundException e) { 
        
         logger.error( 
         "解析word错误，文件地址："  
         + filePath, e); 
        
         }  
         catch  
         (IOException e) { 
        
         logger.error( 
         "解析word错误，文件地址："  
         + filePath, e); 
        
         }  
         finally  
         { 
        
         IOUtils.closeQuietly(inputStream); 
        
         } 
        
         return  
         null 
         ; 
        
         } 
        
         /** 
        
         * 处理2007+的WORD 
        
         * @param filePath 文件地址 
        
         * @return word内容 
        
         */ 
        
         private  
         static  
         String read2007(String filePath) { 
        
         InputStream inputStream =  
         null 
         ; 
        
         StringBuffer content =  
         new  
         StringBuffer(); 
        
         try  
         { 
        
         inputStream =  
         new  
         FileInputStream( 
         new  
         File(filePath)); 
        
         XWPFDocument document =  
         new  
         XWPFDocument(inputStream); 
        
         // 读取非表格文本框 
        
         for  
         (XWPFParagraph xwpfParagraph : document.getParagraphs()) { 
        
         for 
         (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { 
        
         content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); 
        
         } 
        
         } 
        
         // 读取表格内文本框 
        
         for 
         (XWPFTable xwpfTable : document.getTables()) { 
        
         for  
         (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { 
        
         for  
         (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { 
        
         for  
         (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { 
        
         for 
         (XWPFRun xwpfRun : xwpfParagraph.getRuns()) { 
        
         content.append(getXMLContent(xwpfRun.getCTR().newCursor().xmlText())).append(NEW_LINE); 
        
         } 
        
         } 
        
         } 
        
         } 
        
         } 
        
         // 读取表格内容 
        
         for 
         (XWPFTable xwpfTable : document.getTables()) { 
        
         for  
         (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) { 
        
         for  
         (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) { 
        
         for  
         (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) { 
        
         content.append(xwpfParagraph.getText()).append(NEW_LINE); 
        
         } 
        
         } 
        
         } 
        
         } 
        
         return  
         content.toString(); 
        
         }  
         catch  
         (IOException e) { 
        
         logger.error( 
         "解析word错误，文件地址："  
         + filePath, e); 
        
         }  
         finally  
         { 
        
         IOUtils.closeQuietly(inputStream); 
        
         } 
        
         return  
         null 
         ; 
        
         } 
        
         /** 
        
         * 获取XML内容，可以使用递归cursor.getDomNode() 
        
         * @param xml xml 
        
         * @return xml内容 
        
         */ 
        
         private  
         static  
         String getXMLContent(String xml) { 
        
         StringBuffer content =  
         new  
         StringBuffer(); 
        
         Document document; 
        
         try  
         { 
        
         document = DocumentHelper.parseText(xml); 
        
         List<?> namespaces = document.getRootElement().declaredNamespaces();  
         // 判断是否有表格包含文本框 
        
         boolean  
         hasboxintab =  
         false 
         ; 
        
         for  
         (Object object : namespaces) { 
        
         Namespace namespace = (Namespace) object; 
        
         if 
         (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) { 
        
         hasboxintab =  
         true 
         ; 
        
         break 
         ; 
        
         } 
        
         } 
        
         if 
         (!hasboxintab) 
        
         return  
         content.toString(); 
        
         for 
         (Object node : document.selectNodes( 
         "//mc:Fallback//w:p" 
         )) { 
        
         for 
         (Object nodeb : ((Node) node).selectNodes( 
         ".//w:t" 
         )) { 
        
         if 
         (StringUtils.isNotEmpty(((Node) nodeb).getText())) 
        
         content.append(((Node) nodeb).getText()); 
        
         } 
        
         content.append(NEW_LINE); 
        
         } 
        
         }  
         catch  
         (DocumentException e) { 
        
         logger.error( 
         "XML转化错误，内容："  
         + xml, e); 
        
         } 
        
         return  
         content.toString(); 
        
         } 
        
         public  
         static  
         void  
         main(String[] args)  
         throws  
         Exception { 
        
         //      System.out.println(read("e://company/test.doc")); 
        
         //      System.out.println(read("e://company/test.docx")); 
        
         } 
        
         }