使用POI操作Word最为方便,但是格式支持最不完善
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.4</version>
</dependency>
<dependency>
<groupId>fr.opensagres.xdocreport</groupId>
<artifactId>xdocreport</artifactId>
<version>2.0.2</version>
</dependency>
import java.io.*;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
/**
 * Converts Microsoft Word documents ({@code .doc} and {@code .docx}) to HTML files.
 *
 * <p>Every conversion writes into a fresh directory named after the current time in
 * milliseconds ({@code ts}), created next to the source document: the page is written to
 * {@code ts/ts.html} and extracted pictures go to {@code ts/image<ts>/}. The HTML refers to
 * pictures with a path relative to the page, so the output directory can be moved as a whole.
 *
 * <p>All conversion methods report failure by returning an empty string after printing a
 * diagnostic; they do not throw.
 */
public class WordUtils {

    /** Character encoding of the generated HTML. */
    private static final String HTML_ENCODING = "utf-8";

    /**
     * Converts one Word document and prints the path of the generated HTML file.
     *
     * @param args optional; {@code args[0]} is the document to convert. When absent the
     *             built-in sample path is used.
     */
    public static void main(String[] args) {
        String filepath = (args != null && args.length > 0)
                ? args[0]
                : "C:\\Users\\xxx\\Desktop\\xxx.doc";
        String htmlFile = analysisWord(filepath);
        System.out.println(htmlFile);
    }

    /**
     * Dispatches to the converter matching the file extension.
     *
     * @param filepath path of the Word document
     * @return path of the generated HTML file, or {@code ""} when the file is missing, is not
     *         a {@code .doc}/{@code .docx} file, or could not be converted
     */
    private static String analysisWord(String filepath) {
        if (filepath == null || !new File(filepath).isFile()) {
            System.out.println("文件不存在:" + filepath);
            return "";
        }
        // Compare case-insensitively so that e.g. "REPORT.DOCX" is accepted as well.
        String lowerCasePath = filepath.toLowerCase(java.util.Locale.ROOT);
        if (lowerCasePath.endsWith(".doc")) {
            return docToHtml(filepath);
        }
        if (lowerCasePath.endsWith(".docx")) {
            return docxToHtml(filepath);
        }
        System.out.println("此文件不是word文件!");
        return "";
    }

    /**
     * Creates the per-conversion output directory next to the source document.
     *
     * @param filepath path of the source document
     * @param dirName  name of the directory to create
     * @return the existing or newly created directory
     * @throws IOException if the directory does not exist and cannot be created
     */
    private static File createOutputDir(String filepath, String dirName) throws IOException {
        // getParentFile() is null for a bare file name; File(null, child) then resolves the
        // child against the working directory instead of producing a directory called "null".
        File parentDir = new File(filepath).getParentFile();
        File outputDir = new File(parentDir, dirName);
        if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
            throw new IOException("Cannot create output directory: " + outputDir);
        }
        return outputDir;
    }

    /**
     * Converts a {@code .docx} document to XHTML using the xdocreport converter.
     *
     * @param filepath path of the {@code .docx} document
     * @return path of the generated HTML file, or {@code ""} if the conversion failed
     */
    private static String docxToHtml(String filepath) {
        System.out.println("======word附件路径:filepath" + filepath);
        String timemill = String.valueOf(System.currentTimeMillis());
        String imageDirName = "image" + timemill;
        try {
            File outputDir = createOutputDir(filepath, timemill);
            File targetFile = new File(outputDir, timemill + ".html");
            // Resources are opened in this order so that a document that fails to parse
            // does not leave an empty HTML file behind; all of them are closed on exit.
            try (InputStream in = new FileInputStream(filepath);
                 XWPFDocument document = new XWPFDocument(in);
                 Writer writer = new OutputStreamWriter(
                         new FileOutputStream(targetFile), HTML_ENCODING)) {
                XHTMLOptions options = XHTMLOptions.create();
                // Pictures are extracted to <outputDir>/image<ts>/ and referenced relatively.
                options.setExtractor(new FileImageExtractor(new File(outputDir, imageDirName)));
                options.URIResolver(new BasicURIResolver(imageDirName));
                XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
                xhtmlConverter.convert(document, writer, options);
            }
            return targetFile.getPath();
        } catch (IOException | RuntimeException e) {
            // Unchecked parser/converter failures are reported the same way as I/O errors,
            // matching the behaviour of docToHtml.
            System.out.println("============docx文件解析出错!============");
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Converts a legacy {@code .doc} document to HTML using POI's {@code WordToHtmlConverter}.
     *
     * @param filepath path of the {@code .doc} document
     * @return path of the generated HTML file, or {@code ""} if the conversion failed
     */
    private static String docToHtml(String filepath) {
        System.out.println("======word附件路径:filepath" + filepath);
        final String timemill = String.valueOf(System.currentTimeMillis());
        final String imageDirName = "image" + timemill;
        try {
            File outputDir = createOutputDir(filepath, timemill);
            File targetFile = new File(outputDir, timemill + ".html");
            final File imageDir = new File(outputDir, imageDirName);
            if (!imageDir.mkdirs() && !imageDir.isDirectory()) {
                throw new IOException("Cannot create image directory: " + imageDir);
            }
            try (InputStream in = new FileInputStream(filepath);
                 HWPFDocument wordDocument = new HWPFDocument(in)) {
                org.w3c.dom.Document document =
                        DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
                WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
                wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                    @Override
                    public String savePicture(byte[] content, PictureType pictureType,
                                              String name, float width, float height) {
                        // Best effort: a picture that cannot be written is reported but
                        // does not abort the conversion of the rest of the document.
                        try (OutputStream out = new FileOutputStream(new File(imageDir, name))) {
                            out.write(content);
                        } catch (IOException | RuntimeException e) {
                            e.printStackTrace();
                        }
                        // The return value becomes an HTML src attribute, i.e. a URL, so it
                        // must use '/' regardless of the platform's file separator.
                        return imageDirName + "/" + name;
                    }
                });
                wordToHtmlConverter.processDocument(wordDocument);

                Transformer serializer = TransformerFactory.newInstance().newTransformer();
                serializer.setOutputProperty(OutputKeys.ENCODING, HTML_ENCODING);
                serializer.setOutputProperty(OutputKeys.INDENT, "yes");
                serializer.setOutputProperty(OutputKeys.METHOD, "html");
                // Write through a stream we own so it is always flushed and closed.
                try (OutputStream htmlOut = new FileOutputStream(targetFile)) {
                    serializer.transform(
                            new DOMSource(wordToHtmlConverter.getDocument()),
                            new StreamResult(htmlOut));
                }
            }
            return targetFile.getPath();
        } catch (Exception e) {
            System.out.println("============doc文件解析出错!============");
            e.printStackTrace();
            return "";
        }
    }
}