import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToHtmlUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import com.maiyue.base.utils.FileUtils;
import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
/**
 * Utility for converting Word documents to HTML files.
 *
 * <p>{@link #docWordToHtml(String, String)} handles the binary {@code .doc} format and
 * {@link #docxToHtml(String, String, String)} handles the {@code .docx} format. Failures
 * raised during the conversion itself are logged and reported to the caller as a
 * {@code null} return value rather than as an exception.
 *
 * <p>Example:
 * <pre>{@code
 * POIWordToHtmlUtils.docWordToHtml("/tmp/test2003.doc", "/tmp/test1.html");
 * POIWordToHtmlUtils.docxToHtml("/tmp/test2007.docx", "/tmp/test2.html", "/tmp/image");
 * }</pre>
 *
 * @author chen
 */
public class POIWordToHtmlUtils {

    private static final Logger logger = LoggerFactory.getLogger(POIWordToHtmlUtils.class);

    /**
     * Converts a {@code .doc} Word document to an HTML file.
     *
     * <p>The HTML is serialized as UTF-8, indented, using the {@code html} output method.
     *
     * @param sourceFilePath path of the {@code .doc} file to read
     * @param targetFilePath path of the HTML file to write
     * @return {@code targetFilePath} on success, or {@code null} if the conversion failed
     *         (the failure is logged)
     */
    public static String docWordToHtml(String sourceFilePath, String targetFilePath) {
        // NOTE(review): presumably ensures the target's parent folder exists; it runs outside
        // the try block, so anything it throws propagates to the caller.
        FileUtils.createFileFolder(targetFilePath);
        // try-with-resources guarantees the source stream is released on every path.
        try (InputStream input = new FileInputStream(sourceFilePath)) {
            HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc(input);
            WordToHtmlConverter wordToHtmlConverter = new WordImageToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");

            // Write through an explicitly managed stream instead of StreamResult(File), so the
            // target file is opened from the plain path and is always closed by this method.
            try (FileOutputStream output = new FileOutputStream(targetFilePath)) {
                serializer.transform(new DOMSource(htmlDocument), new StreamResult(output));
            }
            return targetFilePath;
        } catch (Exception e) {
            logger.error(".doc的word文档转换为html,发生异常,源文件={},目标文件={}", sourceFilePath, targetFilePath, e);
            return null;
        }
    }

    /**
     * Converts a {@code .docx} Word document to an HTML file encoded as UTF-8.
     *
     * @param sourceFilePath path of the {@code .docx} file to read
     * @param targetFilePath path of the HTML file to write
     * @param imagePath      folder handed to the image extractor for the document's images
     * @return {@code targetFilePath} on success, or {@code null} if the conversion or the
     *         closing of any stream failed (the failure is logged)
     */
    public static String docxToHtml(String sourceFilePath, String targetFilePath, String imagePath) {
        // NOTE(review): presumably ensures the target's parent folder exists; it runs outside
        // the try block, so anything it throws propagates to the caller.
        FileUtils.createFileFolder(targetFilePath);
        try (InputStream input = new FileInputStream(sourceFilePath);
                XWPFDocument document = new XWPFDocument(input)) {
            XHTMLOptions options = XHTMLOptions.create();
            // Folder in which extracted images are stored.
            options.setExtractor(new FileImageExtractor(new File(imagePath)));
            // Path used for images inside the generated HTML.
            options.URIResolver(new BasicURIResolver("image"));

            // The target file is only opened once the source document and the options have
            // been set up, so a failure in those steps does not create or truncate it.
            try (OutputStreamWriter outputStreamWriter =
                    new OutputStreamWriter(new FileOutputStream(targetFilePath), "utf-8")) {
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(document, outputStreamWriter, options);
            }
            // A failure while closing any resource above also lands in the catch below, so the
            // caller still receives null in that case without needing a return inside finally.
            return targetFilePath;
        } catch (Exception e) {
            logger.error(".docx的word文档转换为html,发生异常,源文件={},目标文件={}", sourceFilePath, targetFilePath, e);
            return null;
        }
    }
}
总结:
Word 2007(.docx)文档的目录转换后为空,已经实现该功能的同学麻烦告知实现方法;Word 2003(.doc)文档的目录虽然转换成功,但第一行目录不正确。