/**
 * Converts a Word document to HTML and returns the markup HTML-escaped.
 *
 * <p>The conversion path is chosen from the extension of {@code fileName} (the text after the
 * last {@code '.'}, compared case-insensitively): {@code docx} is loaded as an
 * {@code XWPFDocument} and written with {@code XHTMLConverter}; any other name is loaded as an
 * {@code HWPFDocument}, rendered into a DOM tree by {@code WordToHtmlConverter} and then
 * serialized with a {@code Transformer}.
 *
 * <p>Exporting the embedded pictures is still a TODO; the generated markup only references them.
 *
 * @param inputStream content of the Word document
 * @param fileName    name of the document; only its extension is inspected
 * @return the generated markup after being passed through {@code StringEscapeUtils.escapeHtml}
 * @throws NullPointerException if {@code fileName} is {@code null}
 * @throws Exception            if the document cannot be read or converted
 */
public String convertToHtml(InputStream inputStream, String fileName) throws Exception {
    if (fileName == null) {
        throw new NullPointerException("fileName must not be null");
    }
    String extension = fileName.substring(fileName.lastIndexOf('.') + 1);
    // Compare case-insensitively so that e.g. "REPORT.DOCX" is not sent down the binary .doc path.
    String html = "docx".equalsIgnoreCase(extension)
            ? renderDocxAsHtml(inputStream)
            : renderDocAsHtml(inputStream);
    // Replace the HTML special characters in the generated markup before handing it back.
    return StringEscapeUtils.escapeHtml(html);
}

/** Renders a Word 2007 (.docx) stream as an XHTML fragment. */
private String renderDocxAsHtml(InputStream inputStream) throws Exception {
    XWPFDocument document = new XWPFDocument(PackageHelper.open(inputStream));
    // TODO picture handling: document.getAllPictures() exposes every embedded picture
    // (getFileName() / getData()) once they need to be stored somewhere.
    final String imageUrl = "";
    XHTMLOptions options = XHTMLOptions.create();
    // No extractor is configured, so the pictures are not generated.
    options.setExtractor(null);
    options.setIgnoreStylesIfUnused(false);
    options.setFragment(true);
    options.URIResolver(new IURIResolver() {
        public String resolve(String uri) {
            // Picture references are prefixed with the (currently empty) base URL.
            return imageUrl + uri;
        }
    });
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    XHTMLConverter.getInstance().convert(document, out, options);
    // NOTE(review): still decoded with the platform default charset, as before; the charset
    // XHTMLConverter uses when writing to an OutputStream is not visible here - confirm.
    return out.toString();
}

/** Renders a Word 2003 (.doc) stream as an HTML page. */
private String renderDocAsHtml(InputStream inputStream) throws Exception {
    HWPFDocument wordDocument = new HWPFDocument(inputStream);
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
            DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        public String savePicture(byte[] content, PictureType pictureType, String suggestedName,
                float widthInches, float heightInches) {
            // Path written into the generated page for this picture; the bytes are not stored.
            return "word/media/" + suggestedName;
        }
    });
    wordToHtmlConverter.processDocument(wordDocument);
    // TODO picture handling: wordDocument.getPicturesTable().getAllPictures() exposes every
    // embedded Picture (getContent()) once they need to be stored somewhere.
    Document htmlDocument = wordToHtmlConverter.getDocument();

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "html");
    serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
    // Decode with the same charset the serializer was told to write; relying on the platform
    // default corrupts non-ASCII text on JVMs whose default charset is not UTF-8.
    return out.toString("UTF-8");
}
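// Source article: "Converting a Word document to HTML with POI"
// (page footer of the source article: latest recommended article published 2024-08-19 16:49:49)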