Java 使用 POI 实现 Word 转 HTML

最新推荐文章于 2024-08-19 16:49:49 发布

非ban必选

最新推荐文章于 2024-08-19 16:49:49 发布

阅读量348

点赞数

分类专栏： SpringBoot Java和Jvm

本文链接：https://blog.csdn.net/zsj777/article/details/119518068

版权

Java和Jvm 同时被 2 个专栏收录

116 篇文章 0 订阅

订阅专栏

SpringBoot

83 篇文章 1 订阅

订阅专栏


            <!-- Library for Word 2007 and later formats (.docx) -->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>4.1.2</version>
            </dependency>
            <!-- Library for the Word 2003 format (.doc) -->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-scratchpad</artifactId>
                <version>4.1.2</version>
            </dependency>

            <!-- Core POI artifact, pinned to the same version as the two modules above -->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi</artifactId>
                <version>4.1.2</version>
            </dependency>

            <!-- Provides XHTMLConverter / XHTMLOptions / Base64EmbedImgManager used for the .docx to XHTML conversion -->
            <dependency>
                <groupId>fr.opensagres.xdocreport</groupId>
                <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
                <version>2.0.2</version>
            </dependency>

java工具类

package com.luding.common.utils;

import cn.hutool.core.img.ImgUtil;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.openxml4j.util.ZipSecureFile;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.List;
import java.util.UUID;

/**
 * Utility for converting Word documents into standalone HTML files.
 *
 * <p>Both converters read {@code <wordPath>/<wordName><suffix>} and write
 * {@code <wordPath>/out/<wordName>.html}, overwriting any existing file. Pictures are
 * embedded in the HTML as base64 data so the result is a single self-contained page.
 */
public class OfficeConvertUtil {

    /**
     * Converts a Word 2003 ({@code .doc}) document to an HTML file.
     *
     * @param wordPath directory containing the Word file
     * @param wordName Word file name without its extension
     * @param suffix   Word file extension, including the leading dot (e.g. {@code ".doc"})
     * @return absolute path of the generated HTML file
     * @throws IOException                  if the Word file cannot be read, the output directory
     *                                      cannot be created, or the HTML file cannot be written
     * @throws TransformerException         if serializing the HTML DOM fails
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public static String word2003ToHtml(String wordPath, String wordName,
                                        String suffix) throws IOException, TransformerException,
            ParserConfigurationException {
        String htmlPath = wordPath + File.separator + "out"
                + File.separator;
        // The HTML file is regenerated (overwritten) on every call.
        File htmlFile = new File(htmlPath + wordName + ".html");
        // Source Word document.
        File wordFile = new File(wordPath + File.separator + wordName + suffix);

        Document htmlDocument;
        // try-with-resources: the input stream and the POI document are released even when
        // the conversion throws.
        try (InputStream input = new FileInputStream(wordFile);
             HWPFDocument wordDocument = new HWPFDocument(input)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder()
                            .newDocument());

            // Embed each picture inline: decode it, re-encode it as base64 and return a data
            // URI, so everything lives in a single HTML page.
            wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> {
                BufferedImage bufferedImage = ImgUtil.toImage(content);
                String base64Img = ImgUtil.toBase64(bufferedImage, pictureType.getExtension());
                return "data:;base64," + base64Img;
            });

            // Parse the Word document into the HTML DOM.
            wordToHtmlConverter.processDocument(wordDocument);
            htmlDocument = wordToHtmlConverter.getDocument();
        }

        // Make sure the parent folder of the HTML file exists.
        ensureDirectory(new File(htmlPath));

        // Serialize the DOM; the output stream is closed even if the transform fails.
        try (OutputStream outStream = new FileOutputStream(htmlFile)) {
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");

            serializer.transform(new DOMSource(htmlDocument), new StreamResult(outStream));
        }

        return htmlFile.getAbsolutePath();
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to an HTML file.
     *
     * <p>The output is written with the {@code gbk} charset; the original author chose this
     * because UTF-8 output was garbled on their Windows server.
     *
     * @param wordPath directory containing the Word file
     * @param wordName Word file name without its extension
     * @param suffix   Word file extension, including the leading dot (e.g. {@code ".docx"})
     * @return absolute path of the generated HTML file
     * @throws IOException if the Word file cannot be read, the output directory cannot be
     *                     created, or the HTML file cannot be written
     */
    public static String word2007ToHtml(String wordPath, String wordName, String suffix)
            throws IOException {
        // NOTE(review): this is a static, process-wide setting. A negative ratio appears to
        // switch off POI's zip-bomb inflate-ratio check entirely - confirm this is acceptable
        // before feeding untrusted documents through this method.
        ZipSecureFile.setMinInflateRatio(-1.0d);
        String htmlPath = wordPath + File.separator + "out"
                + File.separator;
        // The HTML file is regenerated (overwritten) on every call.
        File htmlFile = new File(htmlPath + wordName + ".html");

        // Source Word document.
        File wordFile = new File(wordPath + File.separator + wordName + suffix);

        // 1) Load the Word file into an XWPFDocument; stream and document are always closed.
        try (InputStream in = new FileInputStream(wordFile);
             XWPFDocument document = new XWPFDocument(in)) {
            // 2) Pictures are embedded as base64 so everything lives in a single HTML page.
            XHTMLOptions options = XHTMLOptions.create().indent(4).setImageManager(new Base64EmbedImgManager());

            // Make sure the parent folder of the HTML file exists.
            ensureDirectory(new File(htmlPath));

            // 3) Convert the XWPFDocument to XHTML. Closing the writer flushes its buffer;
            //    without that the tail of the generated file can be lost.
            try (OutputStream out = new FileOutputStream(htmlFile);
                 OutputStreamWriter writer = new OutputStreamWriter(out, "gbk")) {
                XHTMLConverter instance = (XHTMLConverter) XHTMLConverter.getInstance();
                instance.convert(document, writer, options);
            }
        }
        return htmlFile.getAbsolutePath();
    }

    /**
     * Creates {@code folder} (and any missing parents) unless it already is a directory.
     *
     * @param folder directory that must exist after this call
     * @throws IOException if the directory does not exist and cannot be created
     */
    private static void ensureDirectory(File folder) throws IOException {
        // The second isDirectory() check tolerates another thread or process creating the
        // folder between our first check and mkdirs().
        if (!folder.isDirectory() && !folder.mkdirs() && !folder.isDirectory()) {
            throw new IOException("Unable to create output directory: " + folder.getAbsolutePath());
        }
    }

    /**
     * Manual smoke test: converts {@code D:\doc\2.doc} and {@code D:\doc\1.docx} and prints
     * the paths of the generated HTML files.
     *
     * @param args unused
     * @throws Exception if either conversion fails
     */
    public static void main(String[] args) throws Exception {
        System.out.println(word2003ToHtml("D:\\doc", "2", ".doc"));
        System.out.println(word2007ToHtml("D:\\doc", "1", ".docx"));
    }

}