package jxfgw.util;
import org.apache.poi.hwpf.HWPFDocumentCore;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.converter.WordToHtmlUtils;
import org.apache.poi.hwpf.usermodel.Picture;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import com.sun.org.apache.xerces.internal.impl.dv.util.Base64;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.Locale;
import java.util.Properties;
/**
 * Converts a binary Word document to an HTML string with Apache POI's
 * {@link WordToHtmlConverter}, embedding pictures inline as base64 {@code data:} URIs.
 */
public class DocToHtml {

    /** Output encoding used when {@link #isWindows()} returns {@code true}. */
    private static final String WINDOWS_ENCODING = "gb2312";

    /** Output encoding used when {@link #isWindows()} returns {@code false}. */
    private static final String DEFAULT_ENCODING = "utf-8";

    /**
     * Demo entry point: loads {@code D:\1.doc} and runs the conversion. The resulting
     * HTML is not written anywhere.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws ParserConfigurationException, TransformerException, IOException {
        InputStream in = new FileInputStream("D:\\1.doc");
        try {
            HWPFDocumentCore wordDocument = WordToHtmlUtils.loadDoc(in);
            DocToHtml docToHtml = new DocToHtml();
            docToHtml.docToHtml(wordDocument);
        } finally {
            // Release the file handle explicitly instead of relying on the loader to do it.
            in.close();
        }
    }

    /**
     * Renders the given Word document as an indented HTML string.
     *
     * <p>The markup is serialized as {@code gb2312} on Windows and {@code utf-8} elsewhere,
     * and the serialized bytes are decoded back with that same encoding.
     *
     * @param wordDocument the loaded Word document to convert
     * @return the HTML markup
     * @throws IOException if the chosen encoding is not supported by the JVM
     * @throws ParserConfigurationException if no DOM document builder can be created
     * @throws TransformerException if serializing the DOM to HTML fails
     */
    public String docToHtml(HWPFDocumentCore wordDocument) throws IOException, ParserConfigurationException, TransformerException {
        WordToHtmlConverter wordToHtmlConverter = new ImageConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()
        );
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();

        // Pick the output encoding from the host OS: Windows vs. everything else (e.g. Linux).
        String encoding = isWindows() ? WINDOWS_ENCODING : DEFAULT_ENCODING;

        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, encoding);
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");

        ByteArrayOutputStream out = new ByteArrayOutputStream();
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // Decode with the encoding the serializer wrote; the platform default charset
        // may differ from it and would corrupt non-ASCII text.
        return out.toString(encoding);
    }

    /**
     * {@link WordToHtmlConverter} that emits each picture as an {@code <img>} element
     * whose {@code src} is a base64 {@code data:} URI.
     */
    public class ImageConverter extends WordToHtmlConverter {

        /**
         * @param document the DOM document the converter writes HTML nodes into
         */
        public ImageConverter(Document document) {
            super(document);
        }

        /**
         * Appends an {@code <img>} element carrying the picture bytes inline to
         * {@code currentBlock}. Presumably invoked by the base converter when no
         * pictures manager is configured (inferred from the method name; confirm
         * against the POI version in use).
         *
         * @param currentBlock element that receives the new {@code <img>} child
         * @param inlined not used by this implementation
         * @param picture picture whose MIME type and raw content are embedded
         */
        @Override
        protected void processImageWithoutPicturesManager(Element currentBlock, boolean inlined, Picture picture) {
            Element imgNode = currentBlock.getOwnerDocument().createElement("img");
            // NOTE(review): Base64 here is the JDK-internal Xerces class imported by this file;
            // java.util.Base64 (Java 8+) is the supported replacement once the baseline allows it.
            String dataUri = "data:" + picture.getMimeType() + ";base64,"
                    + Base64.encode(picture.getRawContent());
            imgNode.setAttribute("src", dataUri);
            currentBlock.appendChild(imgNode);
        }
    }

    /**
     * Reports whether the {@code os.name} system property contains "windows",
     * ignoring case.
     *
     * @return {@code true} on a Windows host; {@code false} otherwise, including when
     *         the property is not set
     */
    public static boolean isWindows() {
        String osName = System.getProperty("os.name", "");
        // Locale.ROOT keeps the case mapping locale-independent; under a Turkish default
        // locale "Windows".toUpperCase() yields "WİNDOWS" and the match would fail.
        return osName.toUpperCase(Locale.ROOT).contains("WINDOWS");
    }

    /**
     * Writes {@code content} to the file at {@code path} encoded as GB2312, replacing any
     * existing file. This is best-effort: I/O failures are printed to standard error and
     * not rethrown.
     *
     * @param content text to write
     * @param path destination file path
     */
    public static void writeFile(String content, String path) {
        FileOutputStream fos = null;
        BufferedWriter bw = null;
        try {
            fos = new FileOutputStream(new File(path));
            bw = new BufferedWriter(new OutputStreamWriter(fos, "GB2312"));
            bw.write(content);
            // Flush here so a failed write of the buffered data is reported with the other
            // I/O errors instead of surfacing only during close.
            bw.flush();
        } catch (IOException ioe) {
            // Also covers FileNotFoundException, which was handled identically.
            ioe.printStackTrace();
        } finally {
            // Close each resource independently so a failure on the writer cannot leak
            // the underlying stream; closing the stream a second time is harmless.
            closeAndReport(bw);
            closeAndReport(fos);
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not rethrowing) any failure. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
    }
}
需要用到的包:POI 3.9 或 POI 3.8 等。
乱码问题:一般在 Windows 环境下使用 GB2312 编码,在 Linux 环境下使用 UTF-8 编码,即可解决乱码问题。