Java解析pdf、doc、docx、ppt、xls等文件

最新推荐文章于 2024-04-16 22:06:56 发布

小周先生~

最新推荐文章于 2024-04-16 22:06:56 发布

阅读量1.7k

点赞数 2

文章标签： java

本文链接：https://blog.csdn.net/xinx98/article/details/119112830

版权

本文是从一个大佬那里摘过来的，忘记是哪个了，在这里向大佬道个歉，并表示感谢，感谢大佬的技术分享，我就厚着脸皮发布了！😉
1.所需依赖

<dependency>
       <groupId>org.apache.pdfbox</groupId>
       <artifactId>pdfbox</artifactId>
       <version>2.0.2</version>
   </dependency>
   <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi</artifactId>
       <version>3.15</version>
   </dependency>
   <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi-ooxml</artifactId>
       <version>3.15</version>
   </dependency>
   <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi-ooxml-schemas</artifactId>
       <version>3.15</version>
   </dependency>
   <dependency>
       <groupId>org.apache.poi</groupId>
       <artifactId>poi-scratchpad</artifactId>
       <version>3.15</version>
   </dependency>
   <dependency>
       <groupId>org.apache.xmlbeans</groupId>
       <artifactId>xmlbeans</artifactId>
       <version>2.6.0</version>
   </dependency>
   <dependency>
       <groupId>dom4j</groupId>
       <artifactId>dom4j</artifactId>
       <version>1.6.1</version>
   </dependency>

2.代码示例

 /**
     * Extracts the text of a legacy Word ({@code .doc}) file.
     *
     * @param filePath path of the {@code .doc} file to read
     * @return the text of the document's overall range
     * @throws Exception if the file cannot be opened or parsed
     */
    public static String getTextFromDoc(String filePath) throws Exception{
        // try-with-resources closes the stream even when the constructor or
        // the text extraction throws; the original only closed it on success.
        try (FileInputStream fis = new FileInputStream(new File(filePath))) {
            HWPFDocument doc = new HWPFDocument(fis);
            Range range = doc.getRange();
            return range.text();
        }
    }
    /**
     * Extracts the text of a Word ({@code .docx}) file.
     *
     * @param filePath path of the {@code .docx} file to read
     * @return the text produced by {@link XWPFWordExtractor#getText()}
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromDocx(String filePath) throws IOException {
        // Both the stream and the document are released on every exit path;
        // the original leaked the stream on failure and never closed the document.
        try (FileInputStream in = new FileInputStream(filePath);
             XWPFDocument doc = new XWPFDocument(in)) {
            XWPFWordExtractor extractor = new XWPFWordExtractor(doc);
            return extractor.getText();
        }
    }
    /**
     * Extracts the text of a PDF file.
     *
     * @param filePath path of the PDF file to read
     * @return the text produced by {@link PDFTextStripper#getText(PDDocument)}
     * @throws IOException if the file cannot be loaded or its text cannot be extracted
     */
    public static String getTextFromPDF(String filePath) throws IOException{
        // PDDocument must be closed after use; the original never closed it.
        try (PDDocument pd = PDDocument.load(new File(filePath))) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pd);
        }
    }
    /**
     * Extracts the text of a legacy PowerPoint ({@code .ppt}) file.
     *
     * @param filePath path of the {@code .ppt} file to read
     * @return the text produced by {@code PowerPointExtractor.getText()}
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPPT( String filePath) throws IOException{
        // The stream is closed here explicitly; the original relied solely on
        // extractor.close() and skipped even that when getText() threw.
        try (FileInputStream in = new FileInputStream(filePath)) {
            PowerPointExtractor extractor = new PowerPointExtractor(in);
            try {
                return extractor.getText();
            } finally {
                extractor.close();
            }
        }
    }
    /**
     * Extracts the text of a PowerPoint ({@code .pptx}) file.
     *
     * <p>Walks every slide's shape tree and concatenates the text of each
     * regular text run, in document order and without separators. Shapes that
     * have no text body are skipped.
     *
     * <p>Failures are propagated to the caller. The previous implementation
     * printed the stack trace and returned {@code null}, which hid parse errors
     * and forced callers to null-check the result.
     *
     * @param filePath path of the {@code .pptx} file to read
     * @return the concatenated run text; empty if the presentation has no text runs
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromPPTX( String filePath) throws IOException{
        StringBuilder sb = new StringBuilder();
        // Stream and slide show are both closed on every exit path.
        try (FileInputStream in = new FileInputStream(filePath);
             XMLSlideShow xmlSlideShow = new XMLSlideShow(in)) {
            for (XSLFSlide slide : xmlSlideShow.getSlides()) {
                CTSlide rawSlide = slide.getXmlObject();
                CTGroupShape shapeTree = rawSlide.getCSld().getSpTree();
                for (CTShape shape : shapeTree.getSpArray()) {
                    CTTextBody textBody = shape.getTxBody();
                    if (textBody == null) {
                        continue;
                    }
                    for (CTTextParagraph paragraph : textBody.getPArray()) {
                        for (CTRegularTextRun run : paragraph.getRArray()) {
                            sb.append(run.getT());
                        }
                    }
                }
            }
        }
        return sb.toString();
    }
    /**
     * Extracts the cell text of a legacy Excel ({@code .xls}) workbook.
     *
     * <p>Visits every sheet, row and cell in index order and appends each
     * non-null cell's text followed by a single space. Missing rows and
     * missing cells contribute nothing.
     *
     * @param filePath path of the {@code .xls} file to read
     * @return the space-separated cell text
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxls(String filePath) throws IOException{
        StringBuilder content = new StringBuilder();
        // Stream and workbook are both closed on every exit path.
        try (FileInputStream in = new FileInputStream(filePath);
             HSSFWorkbook workbook = new HSSFWorkbook(in)) {
            for (int sheetIndex = 0; sheetIndex < workbook.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheet = workbook.getSheetAt(sheetIndex);
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    HSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    for (int cellnum = 0; cellnum < row.getLastCellNum(); cellnum++) {
                        HSSFCell cell = row.getCell(cellnum);
                        if (cell == null) {
                            continue;
                        }
                        int type = cell.getCellType();
                        // getRichStringCellValue() is only valid for string cells,
                        // blank cells and formulas with a cached string result; on
                        // any other cell (e.g. a number) it throws
                        // IllegalStateException, which used to abort the whole read.
                        boolean textual = type == HSSFCell.CELL_TYPE_STRING
                                || type == HSSFCell.CELL_TYPE_BLANK
                                || (type == HSSFCell.CELL_TYPE_FORMULA
                                    && cell.getCachedFormulaResultType() == HSSFCell.CELL_TYPE_STRING);
                        String text = textual
                                ? cell.getRichStringCellValue().getString()
                                : convertCell(cell); // same conversion the .xlsx reader uses
                        content.append(text).append(' ');
                    }
                }
            }
        }
        return content.toString();
    }
    /**
     * Extracts the cell text of an Excel ({@code .xlsx}) workbook.
     *
     * <p>Visits every sheet, row and cell position in index order. Each cell
     * position below the row's last cell number contributes its converted
     * text (nothing for a missing cell) followed by a single space.
     *
     * @param filePath path of the {@code .xlsx} file to read
     * @return the space-separated cell text
     * @throws IOException if the file cannot be opened or parsed
     */
    public static String getTextFromxlsx(String filePath) throws IOException{
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the workbook even if reading fails midway.
        // NOTE(review): opening by path is believed to open the underlying package
        // read-write, so close() may write the file back — consider opening a
        // read-only OPCPackage instead; confirm against the POI documentation.
        try (XSSFWorkbook workbook = new XSSFWorkbook(filePath)) {
            for (int sheet = 0; sheet < workbook.getNumberOfSheets(); sheet++) {
                XSSFSheet aSheet = workbook.getSheetAt(sheet);
                if (aSheet == null) {
                    continue;
                }
                for (int row = 0; row <= aSheet.getLastRowNum(); row++) {
                    XSSFRow aRow = aSheet.getRow(row);
                    if (aRow == null) {
                        continue;
                    }
                    for (int cell = 0; cell < aRow.getLastCellNum(); cell++) {
                        XSSFCell aCell = aRow.getCell(cell);
                        if (aCell != null) {
                            // Convert once; appending an empty string is a no-op,
                            // so the separate length check is unnecessary.
                            content.append(convertCell(aCell));
                        }
                        content.append(" ");
                    }
                }
            }
        }
        return content.toString();
    }

    private static String (Cell cell){
        NumberFormat formater = NumberFormat.getInstance();
        formater.setGroupingUsed(false);
        String cellValue="";
        if(cell==null){
            return cellValue;
        }

        switch(cell.getCellType()){
            case HSSFCell.CELL_TYPE_NUMERIC:
                cellValue = formater.format(cell.getNumericCellValue());
                break;
            case HSSFCell.CELL_TYPE_STRING:
                cellValue = cell.getStringCellValue();
                break;
            case HSSFCell.CELL_TYPE_BLANK:
                cellValue = cell.getStringCellValue();
                break;
            case HSSFCell.CELL_TYPE_BOOLEAN:
                cellValue = Boolean.valueOf(cell.getBooleanCellValue()).toString();
                break;
            case HSSFCell.CELL_TYPE_ERROR:
                cellValue = String.valueOf(cell.getErrorCellValue());
                break;
            default:cellValue="";
        }
        return cellValue.trim();
    }

小周先生~

关注

2
点赞
踩
18

收藏

觉得还不错? 一键收藏
1
评论
Java解析pdf、doc、docx、ppt、xls等文件

本文是从一个大佬那里摘过来的，忘记是哪个了，在这里向大佬道个歉，并表示感谢，感谢大佬的技术分享，我就厚着脸皮发布了！😉 1.所需依赖 <dependency> <groupId>org.apache.pdfbox</groupId> <artifactId>pdfbox</artifactId> <version>2.0.2</version> </dependency> …
复制链接

扫一扫