maven支持:
<properties>
<poi.version>5.2.3</poi.version>
<xhtml.version>2.0.4</xhtml.version>
</properties>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>${poi.version}</version>
</dependency>
<!--word转html-->
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
<version>${xhtml.version}</version>
</dependency>
<!--处理office文档表格相关 2007+版-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>${poi.version}</version>
</dependency>
<!--处理office文档表格相关 2003版-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
工具类:
package com.doc2html;
import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
@Slf4j
public class W3 {
public static void main(String[] args) throws IOException {
String path = "E://test//xxx.docx";
String format = "<!html>%s</html>";
//转换出来是文本,没html头尾标签,可自行拼接
String htmlContent = docxToHtml(path);
//拼接文本,可自由拼接
String content = String .format(format,htmlContent );
//输出为html文件
write2Hmtl(content, "E://test//xxx.html");
}
/**
* Word2007(docx)格式转html
*
* @param filePath 文件路径
* @return 返回转成String类型的html字符串
* @throws
*/
public static String docxToHtml(String filePath) {
try (ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
XWPFDocument docxDocument = new XWPFDocument(Files.newInputStream(Paths.get(filePath)))) {
XHTMLOptions options = XHTMLOptions.create();
// 是否忽略未使用的样式
options.setIgnoreStylesIfUnused(false);
// 设置片段模式,<div>标签包裹
options.setFragment(true);
// 图片转base64
options.setImageManager(new Base64EmbedImgManager());
// 转换htm1
XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
return htmlStream.toString();
} catch (Exception e) {
log.error("Word转Html过程出现异常!", e);
}
return null;
}
public static void write2Hmtl(String content, String path) throws IOException {
try {
new File(path).createNewFile();
} catch (IOException e) {
}
FileWriter fileWriter = new FileWriter(path);
fileWriter.write(content);
fileWriter.flush();
fileWriter.close();
}
/**
* Word2003(doc)格式转html
*
* @param filePath 文件路径
* @return 返回转成String类型的html字符串
* @throws Exception
*/
public static String docToHtml(String filePath) {
try (StringWriter writer = new StringWriter();
HWPFDocument document = new HWPFDocument(Files.newInputStream(new File(filePath).toPath()))) {
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
//将图片转成base64的格式
wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) -> "data:image/png;base64," + Base64.encodeBase64String(bytes));
wordToHtmlConverter.processDocument(document);
org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
DOMSource domSource = new DOMSource(htmlDocument);
TransformerFactory factory = TransformerFactory.newInstance();
Transformer serializer = factory.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, new StreamResult(writer));
return writer.toString();
} catch (Exception e) {
log.error("Word转Html过程出现异常!", e);
}
return null;
}
/**
* word 转 html
* 自动检测文件格式转换
*
* @param filePath 文件本地路径
* @return 成功返回转换后的html字符串;失败返回null
*/
public static String autoWord2Html(String filePath) {
int lastIndexOf = filePath.lastIndexOf(".");
String suffix = filePath.substring(lastIndexOf + 1);
if ("doc".equalsIgnoreCase(suffix)) {
return docToHtml(filePath);
} else if ("docx".equalsIgnoreCase(suffix)) {
return docxToHtml(filePath);
} else {
log.info("文件格式错误,只支持Docx和Doc格式的文档!");
return null;
}
}
}