需要使用的 Maven 依赖:
<!-- Keep the versions of poi, poi-ooxml and poi-scratchpad identical -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Legacy binary Office formats; supplies the org.apache.poi.hwpf classes used for .doc in this article. NOTE(review): the original comment also listed ppt and xls; xls support is believed to live in the core poi artifact, please confirm -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<!-- OOXML formats: docx, pptx, xlsx -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<!-- XWPF (docx) to XHTML converter; supplies XHTMLConverter, XHTMLOptions and Base64EmbedImgManager used for docx in this article -->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>2.0.2</version>
</dependency>
1. doc 转 html
package org.example;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
/**
 * Demo program that converts a legacy binary Word document (.doc) to HTML
 * and prints the resulting markup to standard output.
 *
 * <p>.doc files are read with {@code HWPFDocument}; .docx files need
 * {@code XWPFDocument} instead.
 */
public class DocToHtml {

    /**
     * Reads the hard-coded .doc file, converts it to an HTML DOM with
     * {@code WordToHtmlConverter}, serializes that DOM as UTF-8 HTML and
     * prints it.
     *
     * @param args ignored
     * @throws RuntimeException wrapping any failure raised while reading,
     *         converting or serializing the document
     */
    public static void main(String[] args) {
        String filePath = "C:\\Users\\Administrator\\Desktop\\doc测试.doc";
        File file = new File(filePath);

        // try-with-resources closes both the stream and the document, even when
        // conversion fails; the original version leaked both handles.
        try (FileInputStream inputStream = new FileInputStream(file);
             HWPFDocument hwpfDocument = new HWPFDocument(inputStream)) {

            // The converter writes its output into an empty DOM document.
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            // Walk the Word document and build the HTML DOM.
            wordToHtmlConverter.processDocument(hwpfDocument);
            Document document = wordToHtmlConverter.getDocument();

            // Configure a Transformer that serializes the DOM as indented UTF-8 HTML.
            TransformerFactory transformerFactory = TransformerFactory.newInstance();
            Transformer transformer = transformerFactory.newTransformer();
            transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            transformer.setOutputProperty(OutputKeys.INDENT, "yes");
            transformer.setOutputProperty(OutputKeys.METHOD, "html");

            // transform() takes a Source (the DOM) and a Result (an in-memory buffer).
            DOMSource domSource = new DOMSource(document);
            ByteArrayOutputStream outputtarget = new ByteArrayOutputStream();
            StreamResult streamResult = new StreamResult(outputtarget);
            transformer.transform(domSource, streamResult);

            // Decode with the same charset the transformer was told to emit.
            String string = outputtarget.toString("utf-8");
            System.out.println(string);
        } catch (Exception e) {
            // Keep the original cause and add the file path for easier diagnosis.
            throw new RuntimeException("Failed to convert " + filePath + " to HTML", e);
        }
    }
}
输出:
<html>
<head>
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
<style type="text/css">.b1{white-space-collapsing:preserve;}
.b2{margin: 1.0in 1.25in 1.0in 1.25in;}
.s1{vertical-align:super;font-size:smaller;}
.s2{font-weight:bold;}
.s3{font-size:16pt;}
.s4{font-size:22pt;font-weight:bold;}
.p1{text-align:justify;hyphenate:auto;font-family:Calibri;font-size:10pt;}
</style>
<meta content="Administrator" name="author">
</head>
<body class="b1 b2">
<p class="p1">
<span>X</span><span class="s1">2</span>
</p>
<p class="p1"></p>
<p class="p1">
<span>hhhhhhhhhh</span><span class="s2">hhhhhh</span><span>hhhh</span><span class="s3">hhhh</span><span>hhh</span><span class="s4">h</span><span>hh</span>
</p>
</body>
</html>
2. docx 转 html
package org.example;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
/**
 * Demo program that converts an OOXML Word document (.docx) to XHTML with
 * the xdocreport {@code XHTMLConverter} and prints the resulting markup to
 * standard output.
 */
public class DocxToHtml {

    /**
     * Reads the hard-coded .docx file, converts it to XHTML with images
     * embedded as Base64, and prints the markup.
     *
     * @param args ignored
     * @throws RuntimeException wrapping any failure raised while reading or
     *         converting the document
     */
    public static void main(String[] args) {
        String filePath = "C:\\Users\\Administrator\\Desktop\\docx测试.docx";
        File file = new File(filePath);

        // try-with-resources closes both the stream and the document, even when
        // conversion fails; the original version leaked both handles.
        try (FileInputStream inputStream = new FileInputStream(file);
             XWPFDocument xwpfDocument = new XWPFDocument(inputStream)) {

            // Conversion options.
            XHTMLOptions xhtmlOptions = XHTMLOptions.create();
            // Per the original author's note, this makes styles be written inline
            // instead of into a <style> tag (default false).
            // NOTE(review): fragment mode also appears to omit the html/head/body
            // wrapper; confirm against the XHTMLOptions documentation.
            xhtmlOptions.setFragment(true);
            // Do not drop style definitions just because they are unused.
            xhtmlOptions.setIgnoreStylesIfUnused(false);
            // Embed images in the markup as Base64 rather than as external files.
            xhtmlOptions.setImageManager(new Base64EmbedImgManager());

            // Convert the document into an in-memory buffer.
            ByteArrayOutputStream outputtarget = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(xwpfDocument, outputtarget, xhtmlOptions);

            // Decode the buffer as UTF-8 and print it for inspection.
            String string = outputtarget.toString("utf-8");
            System.out.println(string);
        } catch (Exception e) {
            // Keep the original cause and add the file path for easier diagnosis.
            throw new RuntimeException("Failed to convert " + filePath + " to HTML", e);
        }
    }
}
输出:
<div style="width:595.3pt;margin-bottom:72.0pt;margin-top:72.0pt;margin-left:90.0pt;margin-right:90.0pt;"><p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">X</span><span style="font-family:'Calibri';font-size:10.0pt;vertical-align:super;">2</span></p><p style="white-space:pre-wrap;"><br/></p><p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">sjhhfios</span><span style="font-weight:bold;white-space:pre-wrap;">afjoajdp</span><span style="white-space:pre-wrap;">asj</span><span id="_GoBack"/></p></div>
格式化后便于阅读:
<div style="width:595.3pt;margin-bottom:72.0pt;margin-top:72.0pt;margin-left:90.0pt;margin-right:90.0pt;">
<p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">X</span><span
style="font-family:'Calibri';font-size:10.0pt;vertical-align:super;">2</span></p>
<p style="white-space:pre-wrap;"><br /></p>
<p style="white-space:pre-wrap;"><span style="white-space:pre-wrap;">sjhhfios</span><span
style="font-weight:bold;white-space:pre-wrap;">afjoajdp</span><span
style="white-space:pre-wrap;">asj</span><span id="_GoBack" /></p>
</div>