【文档解析工具类】

远离bug，珍爱头发

已于 2024-09-14 09:14:19 修改

阅读量415

点赞数 4

文章标签： java

于 2024-09-13 16:10:33 首次发布

本文链接：https://blog.csdn.net/qq_45925787/article/details/142213290

版权

文档解析工具类

- 数据填充 Word 模板（使用 ‘{字段名}’ 形式的占位符进行填充）
- 读取不同格式文件的页数

可能用到的maven依赖

    <dependencies>
        <dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itextpdf</artifactId>
            <version>5.0.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>
    </dependencies>

数据填充 Word 模板（使用 ‘{字段名}’ 形式的占位符进行填充）

  /**
     * 将数据填充到word文档中
     *
     * @param templateDoc
     * @param data
     * @param <T>
     * @return
     */
    public static <T> XWPFDocument fillTemplate(XWPFDocument templateDoc, T data) {
        Field[] fields = data.getClass().getDeclaredFields();
        for (Field field : fields) {
            field.setAccessible(true); // 设置属性可访问
        }

        // 替换段落中的占位符
        for (XWPFParagraph paragraph : templateDoc.getParagraphs()) {
            replacePlaceholder(paragraph, fields, data);
        }

        // 替换表格中的占位符
        for (XWPFTable table : templateDoc.getTables()) {
            for (XWPFTableRow row : table.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    replacePlaceholderInCell(cell, fields, data);
                }
            }
        }

        return templateDoc;
    }

    // 替换段落中的占位符
    private static <T> void replacePlaceholder(XWPFParagraph paragraph, Field[] fields, T data) {
        for (XWPFRun run : paragraph.getRuns()) {
            String text = run.getText(0);
            if (text != null && text.contains("{")) {
                for (Field field : fields) {
                    String fieldName = field.getName();
                    if (text.contains("{" + fieldName + "}")) {
                        try {
                            Object value = field.get(data);
                            text = text.replace("{" + fieldName + "}", value != null ? value.toString() : " ");
                            run.setText(text, 0);
                        } catch (IllegalAccessException e) {
                            log.error("生成底稿失败: " + e.getMessage());
                        }
                    }
                }
            }
        }
    }

    // 替换单元格中的占位符
    private static <T> void replacePlaceholderInCell(XWPFTableCell cell, Field[] fields, T data) {
        for (XWPFParagraph paragraph : cell.getParagraphs()) {
            for (XWPFRun run : paragraph.getRuns()) {
                String text = run.getText(0);
                if (text != null && text.contains("{") && text.contains("}")) {
                    for (Field field : fields) {
                        String fieldName = field.getName();
                        if (text.contains("{" + fieldName + "}")) {
                            try {
                                Object value = field.get(data);
                                String replacedText = text.replace("{" + fieldName + "}", value != null ? value.toString() : "");

                                // 处理换行符
                                if (replacedText.contains("\n")) {
                                    handleTextWithLineBreaks(run, replacedText);
                                } else {
                                    run.setText(replacedText, 0);
                                }

                            } catch (IllegalAccessException e) {
                                log.error("Error accessing field: " + e.getMessage());
                            }
                        }
                    }
                }
            }
        }
    }

    // 处理包含换行符的文本，将其分行显示
    private static void handleTextWithLineBreaks(XWPFRun run, String text) {
        // 分割文本，处理 \n 或 \r\n 换行符
        String[] lines = text.split("\\r?\\n");
        // 清空已有的内容
        run.setText("", 0); // 清除当前run中的文本
        // 添加新文本和换行符
        for (int i = 0; i < lines.length; i++) {
            if (i > 0) {
                run.addBreak(); // 在每一行之后插入换行符
            }
            run.setText(lines[i], i == 0 ? 0 : run.getTextPosition());
        }
    }

读取不同格式文件的页数

/**
     * 将文档转为pdf 返回文件页数
     * @param docUrl
     * @return
     */
    public static int convertDocToPdfAndGetPageCount(String docUrl) {
        try {
            // 从在线地址下载文档
            HttpURLConnection connection = (HttpURLConnection) new URL(docUrl).openConnection();
            connection.setRequestMethod("GET");
            InputStream inputStream = connection.getInputStream();

            // 获取文件名和后缀
            String fileName = new File(docUrl).getName();
            String fileExtension = fileName.substring(fileName.lastIndexOf(".") + 1).toLowerCase();

            // 根据文件后缀执行不同的处理方式
            switch (fileExtension) {
                case "pdf":
                    // 如果文件已经是 PDF 格式，直接计算页数
                    return countPdfPage(inputStream);
                case "docx": {
                    // 如果是 docx 文件，使用 XWPFDocument 读取并转换为 PDF

                    return countWord2007Page(inputStream);
                }
                case "doc": {
                    // 如果是 doc 文件，使用 HWPFDocument 读取并转换为 PDF
                    return countWord2003Page(inputStream);
                }
                case "ppt": {
                    return countPPTPage(inputStream);
                }
                case "pptx": {
                    return countPPTXPage(inputStream);
                }
                default:
                    //其他格式当1页处理
                    return 1;
            }

        } catch (Exception e) {
            log.error("解析文档错误：{}",e.getMessage());
            return 1;
        }
    }

 

    /**
     * 计算PDF格式文档的页数
     */
    public static int countPdfPage(InputStream fileInputStream) {
        int pageCount = 0;
        PdfReader reader = null;
        try {
            reader = new PdfReader(fileInputStream);
            pageCount = reader.getNumberOfPages();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            reader.close();
        }
        return pageCount;
    }

    /**
     * 计算PPTX格式文档的页数
     * @param fileInputStream
     * @return
     * @throws IOException
     */
    public static int countPPTPage(InputStream fileInputStream) throws IOException {
        int pageCount = 0;
        ZipSecureFile.setMinInflateRatio(-1.0d);

        HSLFSlideShow hslfSlideShow = new HSLFSlideShow(fileInputStream);
        try {
            pageCount = hslfSlideShow.getSlides().size();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            fileInputStream.close();
        }
        return pageCount;

    }

    /**
     * 计算PPTX格式文档的页数
     */
    public static int countPPTXPage(InputStream fileInputStream) throws IOException {
        int pageCount = 0;
        ZipSecureFile.setMinInflateRatio(-1.0d);
        try {
            XMLSlideShow pptxFile = new XMLSlideShow(fileInputStream);
            pageCount = pptxFile.getSlides().size();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            fileInputStream.close();
        }
        return pageCount;
    }

    /**
     * 计算WORD2007(*.docx)格式文档的页数
     */
    public static int countWord2007Page(InputStream fileInputStream) throws IOException {
        int pageCount = 0;
        ZipSecureFile.setMinInflateRatio(-1.0d);
        XWPFDocument docx = null;
        try {
            docx = new XWPFDocument(fileInputStream);
            pageCount = docx.getProperties().getExtendedProperties().getUnderlyingProperties().getPages();//总页数
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            docx.close();
        }
        return pageCount;
    }

    /**
     * 计算WORD2003(*.doc)格式文档的页数
     */
    public static int countWord2003Page(InputStream fileInputStream) throws IOException {
        int pageCount = 0;
        WordExtractor doc = null;
        ZipSecureFile.setMinInflateRatio(-1.0d);
        try {
            doc = new WordExtractor(fileInputStream);//.doc格式Word文件提取器
            pageCount = doc.getSummaryInformation().getPageCount();//总页数
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            doc.close();
        }
        return pageCount;
    }