通过将 Word 文档转换成 HTML 文件、再读取该 HTML 文件的方式,获取带有格式的 Word 文本:
依赖包: http://download.csdn.net/detail/sxg0205/8959545
注意点:实测发现只能保留 Word 的基本格式,如左对齐、右对齐、换行等;插入的图片、横线等格式似乎无法获取,可能是代码写得不全所致,后续再定位。
代码:
/**
 * Converts a fixed .docx file to XHTML on disk, reads the generated HTML back
 * and prints it to standard output as a single string (line breaks removed).
 *
 * <p>NOTE(review): {@code path} and {@code name} are currently unused; the input
 * document, the output HTML file and the image folder are all hard-coded below.
 * Confirm with callers before wiring the parameters in.
 *
 * @param path currently unused
 * @param name currently unused
 * @throws IOException if the document cannot be read, or the HTML file cannot
 *                     be written or read back
 */
public void canExtractImage(String path, String name) throws IOException {
    File htmlFile = new File("d:/excel/test.html");
    // Images must go to a directory of their own. The previous code pointed the
    // extractor at the output HTML file itself, so no image could be written.
    File imageFolder = new File("d:/excel/images");

    // Close both streams before the HTML is read back, so the converter output
    // is fully flushed to disk and no file handle is leaked.
    try (InputStream is = new FileInputStream("d:\\excel\\doc\\1.docx");
         OutputStream out = new FileOutputStream(htmlFile)) {
        XWPFDocument document = new XWPFDocument(is);
        XHTMLOptions options = XHTMLOptions.create();
        // Extract embedded images into imageFolder ...
        options.setExtractor(new FileImageExtractor(imageFolder));
        // ... and make the generated HTML reference them there.
        options.URIResolver(new FileURIResolver(imageFolder));
        XHTMLConverter.getInstance().convert(document, out, options);
    }

    StringBuilder sb = new StringBuilder();
    // NOTE(review): the file is decoded with the platform default charset, as
    // before; confirm which encoding the converter writes.
    try (java.io.BufferedReader in = new java.io.BufferedReader(
            new java.io.InputStreamReader(new FileInputStream(htmlFile)))) {
        for (String line; (line = in.readLine()) != null;) {
            // Lines are concatenated without separators, as in the original.
            sb.append(line);
        }
    }
    System.out.println(sb.toString());
}