Java 将 Word 内容转化成 HTML

boymusic

于 2023-12-18 18:12:46 发布

阅读量366

点赞数 10

文章标签： java word html

本文链接：https://blog.csdn.net/x1778161229/article/details/135068749

版权

引入依赖

		<!-- Apache POI core library. -->
		<!-- NOTE(review): the exclusion below targets org.apache.poi:poi-xml; confirm that
		     artifact is actually pulled in transitively, otherwise the exclusion is a no-op. -->
		<dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.poi</groupId>
                    <artifactId>poi-xml</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- Library for Word 2007 and later (.docx); keep its version aligned with poi above. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- Library for Word 2003 (.doc); keep its version aligned with poi above. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- Supplies XHTMLConverter, XHTMLOptions and Base64EmbedImgManager used for .docx conversion. -->
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
            <version>2.0.2</version>
        </dependency>

        <!-- XML parser implementation. -->
        <!-- NOTE(review): presumably required by the DOM/transform step at runtime; confirm it is
             still needed and check the pinned version against current security advisories. -->
        <dependency>
            <groupId>xerces</groupId>
            <artifactId>xercesImpl</artifactId>
            <version>2.12.0</version>
        </dependency>

OfficeConvertUtil 工具类

import cn.hutool.core.img.ImgUtil;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * Static helpers that render an opened Word document as a single HTML string,
 * with pictures embedded inline as base64 data rather than written to separate files.
 */
public class OfficeConvertUtil {

    /**
     * Converts a Word 2003 ({@code .doc}) document to HTML.
     *
     * <p>Pictures are re-encoded and embedded as {@code data:} URIs so the result is
     * self-contained. The document is not closed by this method.
     *
     * @param wordDocument the opened Word 2003 document to convert
     * @return the HTML markup, serialized with the {@code html} output method, indented,
     *         and encoded/decoded as UTF-8
     * @throws TransformerException if the HTML DOM cannot be serialized
     * @throws ParserConfigurationException if no DOM document builder can be created
     */
    public static String word2003ToHtml(HWPFDocument wordDocument) throws TransformerException,
            ParserConfigurationException {

        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());
        // Embed each picture as a base64 data URI so everything lives in one page.
        wordToHtmlConverter.setPicturesManager((content, pictureType, suggestedName, widthInches, heightInches) -> {
            BufferedImage bufferedImage = ImgUtil.toImage(content);
            String base64Img = ImgUtil.toBase64(bufferedImage, pictureType.getExtension());
            // Declare the media type explicitly: a data URI without one is not typed as an image.
            return "data:" + pictureType.getMime() + ";base64," + base64Img;
        });
        // Parse the Word document into an HTML DOM.
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();

        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(outputStream));

        // The bytes were written as UTF-8 above, so decode them as UTF-8 instead of
        // relying on the platform default charset.
        return new String(outputStream.toByteArray(), StandardCharsets.UTF_8);
    }

    /**
     * Converts a Word 2007+ ({@code .docx}) document to HTML.
     *
     * <p>The document is not closed by this method.
     *
     * @param document the opened Word 2007+ document to convert
     * @return the generated XHTML markup, indented by 4
     * @throws IOException if the conversion fails while writing the output
     */
    public static String word2007ToHtml(XWPFDocument document)
            throws IOException {
        // Pictures are embedded as base64 so everything lives in one page.
        XHTMLOptions options = XHTMLOptions.create().indent(4).setImageManager(new Base64EmbedImgManager());
        // Convert the XWPFDocument to XHTML into an in-memory buffer.
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(document, outputStream, options);
        // NOTE(review): this decodes with the platform default charset; confirm which charset
        // XHTMLConverter uses when writing to an OutputStream before pinning one here.
        return outputStream.toString();
    }

}

输出内容

	    // Convert the Word file to HTML, choosing the converter by file extension.
         String content;
         String path = param.getPath();
         // Case-insensitive ".docx" suffix check, so ".DOCX" is not sent to the .doc branch.
         String docxSuffix = ".docx";
         boolean isDocx = path.regionMatches(true, path.length() - docxSuffix.length(),
                 docxSuffix, 0, docxSuffix.length());
         // try-with-resources closes the stream and the document even if conversion throws.
         try (FileInputStream fis = new FileInputStream(file)) {
             if (isDocx) {
                 try (XWPFDocument document = new XWPFDocument(fis)) {
                     content = OfficeConvertUtil.word2007ToHtml(document);
                 }
             } else {
                 try (HWPFDocument wordDocument = new HWPFDocument(fis)) {
                     content = OfficeConvertUtil.word2003ToHtml(wordDocument);
                 }
             }
         }
         // Literal replacement; no regex is needed for a plain newline.
         content = content.replace("\n", "<br/>");