使用 Java 将 doc/docx 文件转换成 HTML

需要使用的maven依赖

        <!-- Keep the versions of poi, poi-ooxml and poi-scratchpad identical -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- Handles doc, ppt and xls -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>4.1.2</version>
        </dependency>
        <!-- Handles docx, pptx and xlsx -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>4.1.2</version>
        </dependency>

        <!-- Supplies the fr.opensagres.poi.xwpf.converter.xhtml classes used by the docx example -->
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
            <version>2.0.2</version>
        </dependency>

1.doc转html

package org.example;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;



/**
 * Demo that converts a binary Word document (.doc) to HTML using Apache POI's
 * {@code WordToHtmlConverter} and prints the resulting markup to standard output.
 */
public class DocToHtml {
    /**
     * Loads the hard-coded .doc file, converts it to an HTML DOM, serializes that DOM
     * as UTF-8 HTML text and prints it.
     *
     * @param args unused
     * @throws RuntimeException wrapping any exception raised while reading or converting
     */
    public static void main(String[] args) {
        // .doc files are read with HWPFDocument; .docx files are read with XWPFDocument.
        String filePath="C:\\Users\\Administrator\\Desktop\\doc测试.doc";
        File file = new File(filePath);
        // try-with-resources guarantees that the document and the underlying stream are
        // closed (in reverse order of declaration) on both the success and failure paths.
        try (FileInputStream inputStream = new FileInputStream(file);
             HWPFDocument hwpfDocument = new HWPFDocument(inputStream)) {
            // The converter writes its result into a fresh, empty DOM document.
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            // Walk the Word document and build the HTML DOM.
            wordToHtmlConverter.processDocument(hwpfDocument);
            Document document = wordToHtmlConverter.getDocument();

            // Configure a Transformer that serializes the DOM as indented UTF-8 HTML.
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            Transformer transformer = transformerFactory.newTransformer();
            transformer.setOutputProperty(OutputKeys.ENCODING,"UTF-8");
            transformer.setOutputProperty(OutputKeys.INDENT,"yes");
            transformer.setOutputProperty(OutputKeys.METHOD,"html");

            // transform() takes a Source (the DOM) and a Result (here an in-memory buffer).
            DOMSource domSource = new DOMSource(document);
            ByteArrayOutputStream outputtarget = new ByteArrayOutputStream();
            StreamResult streamResult = new StreamResult(outputtarget);
            transformer.transform(domSource,streamResult);

            // Decode with the same charset the transformer was told to write.
            String string = outputtarget.toString("utf-8");
            System.out.println(string);
        } catch (Exception e) {
            // Keep the cause and add the file path so the failure is diagnosable.
            throw new RuntimeException("Failed to convert " + filePath + " to HTML", e);
        }
    }
}

输出:


<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
<style type="text/css">.b1{white-space-collapsing:preserve;}
.b2{margin: 1.0in 1.25in 1.0in 1.25in;}
.s1{vertical-align:super;font-size:smaller;}
.s2{font-weight:bold;}
.s3{font-size:16pt;}
.s4{font-size:22pt;font-weight:bold;}
.p1{text-align:justify;hyphenate:auto;font-family:Calibri;font-size:10pt;}
</style>
<meta content="Administrator" name="author">
</head>
<body class="b1 b2">
<p class="p1">
<span>X</span><span class="s1">2</span>
</p>
<p class="p1"></p>
<p class="p1">
<span>hhhhhhhhhh</span><span class="s2">hhhhhh</span><span>hhhh</span><span class="s3">hhhh</span><span>hhh</span><span class="s4">h</span><span>hh</span>
</p>
</body>
</html>

2.docx转html

package org.example;

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;


/**
 * Demo that converts an OOXML Word document (.docx) to XHTML using the xdocreport
 * {@code XHTMLConverter} and prints the resulting markup to standard output.
 */
public class DocxToHtml {
    /**
     * Loads the hard-coded .docx file, converts it to XHTML in memory and prints it.
     *
     * @param args unused
     * @throws RuntimeException wrapping any exception raised while reading or converting
     */
    public static void main(String[] args) {

        String filePath="C:\\Users\\Administrator\\Desktop\\docx测试.docx";
        File file = new File(filePath);
        // try-with-resources guarantees that the document and the underlying stream are
        // closed (in reverse order of declaration) on both the success and failure paths.
        try (FileInputStream inputStream = new FileInputStream(file);
             XWPFDocument xwpfDocument = new XWPFDocument(inputStream)) {
            // Conversion options.
            XHTMLOptions xhtmlOptions = XHTMLOptions.create();
            // The original note here read: "write all styles inline rather than into a
            // style tag; default false".
            // NOTE(review): the sample output of this program is a bare <div> with inline
            // styles and no html/head/body wrapper - confirm against the XHTMLOptions
            // documentation which of the two settings below is responsible for what.
            xhtmlOptions.setFragment(true);
            xhtmlOptions.setIgnoreStylesIfUnused(false);
            // Embed images directly in the markup as base64 data.
            xhtmlOptions.setImageManager(new Base64EmbedImgManager());

            // Convert the XWPFDocument to HTML, collecting the bytes in memory.
            ByteArrayOutputStream outputtarget = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(xwpfDocument,outputtarget,xhtmlOptions);

            // Decode the buffer as UTF-8 text and print it for inspection.
            String string = outputtarget.toString("utf-8");
            System.out.println(string);
        } catch (Exception e) {
            // Keep the cause and add the file path so the failure is diagnosable.
            throw new RuntimeException("Failed to convert " + filePath + " to HTML", e);
        }
    }
}

输出:

<div style="width:595.3pt;margin-bottom:72.0pt;margin-top:72.0pt;margin-left:90.0pt;margin-right:90.0pt;"><p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">X</span><span style="font-family:'Calibri';font-size:10.0pt;vertical-align:super;">2</span></p><p style="white-space:pre-wrap;"><br/></p><p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">sjhhfios</span><span style="font-weight:bold;white-space:pre-wrap;">afjoajdp</span><span style="white-space:pre-wrap;">asj</span><span id="_GoBack"/></p></div>

将上面的输出格式化后更便于阅读:

<div style="width:595.3pt;margin-bottom:72.0pt;margin-top:72.0pt;margin-left:90.0pt;margin-right:90.0pt;">
        <p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">X</span><span
                        style="font-family:'Calibri';font-size:10.0pt;vertical-align:super;">2</span></p>
        <p style="white-space:pre-wrap;"><br /></p>
        <p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">sjhhfios</span><span
                        style="font-weight:bold;white-space:pre-wrap;">afjoajdp</span><span
                        style="white-space:pre-wrap;">asj</span><span id="_GoBack" /></p>
</div>

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Java-请多指教

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值