Java使用工具将word转换为html_java word转html包含样式-CSDN博客

本文链接：https://blog.csdn.net/weixin_42471125/article/details/140842067

maven支持：

<properties>
	<poi.version>5.2.3</poi.version>
	<xhtml.version>2.0.4</xhtml.version>
</properties>
 <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <!--word转html-->
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
            <version>${xhtml.version}</version>
        </dependency>
        <!--处理office文档表格相关 2007+版-->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>${poi.version}</version>
        </dependency>
        <!--处理office文档表格相关 2003版-->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>${poi.version}</version>
        </dependency>

工具类：

package com.doc2html;

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Utility for converting Word documents (.doc / .docx) to an HTML string,
 * with embedded pictures inlined as base64 data URIs.
 */
@Slf4j
public class W3 {
    /**
     * Demo entry point: converts a hard-coded .docx file and writes the result
     * next to it as a standalone HTML page.
     *
     * @param args unused
     * @throws IOException if the HTML file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String path = "E://test//xxx.docx";
        // The converter is run in fragment mode, so the page skeleton is added here.
        // The charset declaration matches the UTF-8 encoding used by write2Hmtl.
        String format = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"></head><body>%s</body></html>";
        String htmlContent = docxToHtml(path);
        if (htmlContent == null) {
            // docxToHtml has already logged the failure; do not write a page containing "null".
            return;
        }
        String content = String.format(format, htmlContent);
        write2Hmtl(content, "E://test//xxx.html");
    }

    /**
     * Converts a Word 2007+ (.docx) file to an HTML fragment.
     *
     * @param filePath path of the .docx file
     * @return the HTML fragment, or {@code null} if reading or converting fails
     *         (the failure is logged, not thrown)
     */
    public static String docxToHtml(String filePath) {
        // The input stream is its own resource so it is closed even when the
        // XWPFDocument constructor throws.
        try (InputStream in = Files.newInputStream(Paths.get(filePath));
             XWPFDocument docxDocument = new XWPFDocument(in);
             ByteArrayOutputStream htmlStream = new ByteArrayOutputStream()) {
            XHTMLOptions options = XHTMLOptions.create();
            // Keep style definitions even when no element references them.
            options.setIgnoreStylesIfUnused(false);
            // Fragment mode: the caller is expected to supply the surrounding page.
            options.setFragment(true);
            // Inline pictures as base64 instead of writing them to separate files.
            options.setImageManager(new Base64EmbedImgManager());
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): toString() decodes with the platform default charset; confirm
            // which charset the converter writes to the stream before pinning one here.
            return htmlStream.toString();
        } catch (Exception e) {
            log.error("Word转Html过程出现异常！", e);
            return null;
        }
    }

    /**
     * Writes the given text to a file as UTF-8, creating the file or replacing
     * its previous content.
     *
     * @param content text to write; must not be {@code null}
     * @param path    destination file path
     * @throws IOException          if the file cannot be created or written
     * @throws NullPointerException if {@code content} is {@code null}
     */
    public static void write2Hmtl(String content, String path) throws IOException {
        if (content == null) {
            throw new NullPointerException("content must not be null");
        }
        // try-with-resources closes the writer even when write() fails; the writer
        // creates the file itself, so no separate createNewFile() call is needed.
        try (Writer out = Files.newBufferedWriter(Paths.get(path), StandardCharsets.UTF_8)) {
            out.write(content);
        }
    }

    /**
     * Converts a Word 97-2003 (.doc) file to a complete HTML document.
     *
     * @param filePath path of the .doc file
     * @return the HTML text, or {@code null} if reading or converting fails
     *         (the failure is logged, not thrown)
     */
    public static String docToHtml(String filePath) {
        // The input stream is its own resource so it is closed even when the
        // HWPFDocument constructor throws.
        try (InputStream in = Files.newInputStream(Paths.get(filePath));
             HWPFDocument document = new HWPFDocument(in);
             StringWriter writer = new StringWriter()) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Inline each picture as a data URI labelled with its actual MIME type;
            // fall back to PNG only when the type is not reported.
            wordToHtmlConverter.setPicturesManager((bytes, pictureType, suggestedName, widthInches, heightInches) -> {
                String mime = pictureType != null ? pictureType.getMime() : "image/png";
                return "data:" + mime + ";base64," + Base64.encodeBase64String(bytes);
            });
            wordToHtmlConverter.processDocument(document);

            org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            log.error("Word转Html过程出现异常！", e);
            return null;
        }
    }

    /**
     * Converts a Word file to HTML, choosing the converter from the file
     * extension (case-insensitive): {@code doc} or {@code docx}.
     *
     * @param filePath local path of the Word file
     * @return the converted HTML, or {@code null} if the path is {@code null},
     *         the extension is unsupported, or the conversion fails
     */
    public static String autoWord2Html(String filePath) {
        // A null path is treated like an unsupported format instead of throwing.
        String suffix = "";
        if (filePath != null) {
            // Without a dot lastIndexOf returns -1, so the whole path becomes the
            // suffix and is rejected below.
            suffix = filePath.substring(filePath.lastIndexOf(".") + 1);
        }
        if ("doc".equalsIgnoreCase(suffix)) {
            return docToHtml(filePath);
        }
        if ("docx".equalsIgnoreCase(suffix)) {
            return docxToHtml(filePath);
        }
        log.warn("文件格式错误，只支持Docx和Doc格式的文档！");
        return null;
    }
}