OpenOffice的格式识别比POI好,比Jacob差,但是可以多平台运行,使用过程比较麻烦。
- 下载OpenOffice,并安装
- 下载Jodconverter
- 启动OpenOffice
在cmd命令行中,进入OpenOffice安装目录下的program文件夹,运行
soffice -headless -accept="socket,port=8100;urp;"
部分电脑需要输入以下内容
soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard
- 引入Jodconverter压缩包中lib文件夹下所有的jar包
- 代码
import com.artofsolving.jodconverter.DocumentConverter;
import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;
import java.io.*;
import java.net.ConnectException;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Converts office documents to HTML through an OpenOffice service (using JODConverter) and
 * reduces the generated markup to a lightly formatted HTML fragment.
 *
 * <p>All methods connect to an OpenOffice instance over a socket on port 8100; that service
 * must already be running before any conversion is attempted.
 */
public class OpenOffice {

    /** Port on which the OpenOffice service is expected to accept socket connections. */
    private static final int OPEN_OFFICE_PORT = 8100;

    /** Matches the document body; DOTALL so a body spanning several lines is still found. */
    private static final Pattern BODY_PATTERN = Pattern.compile("<BODY .*</BODY>", Pattern.DOTALL);

    /** Matches a paragraph so that its opening tag can be rewritten without attributes. */
    private static final Pattern PARAGRAPH_PATTERN =
            Pattern.compile("(<P)([^>]*)(>.*?)(<\\/P>)", Pattern.DOTALL);

    /** Matches opening/closing tags that are removed outright (their content is kept). */
    private static final Pattern NOISE_TAG_PATTERN = Pattern.compile(
            "<[/]?(font|FONT|span|SPAN|xml|XML|del|DEL|ins|INS|meta|META|[ovwxpOVWXP]:\\w+)[^>]*?>");

    /**
     * Matches a tag containing one presentational attribute. The attribute value may be
     * single-quoted, double-quoted, or unquoted (terminated by whitespace or the end of the tag).
     * Group 1 is the tag text before the attribute, group 2 the tag text after it.
     */
    private static final Pattern NOISE_ATTRIBUTE_PATTERN = Pattern.compile(
            "<([^>]*)(?:lang|LANG|class|CLASS|style|STYLE|size|SIZE|face|FACE|[ovwxpOVWXP]:\\w+)"
                    + "=(?:'[^']*'|\"[^\"]*\"|[^\\s>]+)([^>]*)>");

    /** Sample entry point: converts a hard-coded document and prints the cleaned HTML. */
    public static void main(String[] args) throws Exception {
        System.out.println(toHtmlString(new File("C:\\Users\\xxx\\Desktop\\aaa.docx"), "D:/test"));
    }

    /**
     * Converts {@code docFile} to an HTML file inside {@code filepath}.
     *
     * <p>The output file is named after the current time in milliseconds with an
     * {@code .html} extension.
     *
     * @param docFile  the document to convert
     * @param filepath directory in which the HTML file is created
     * @return the generated HTML file
     * @throws IllegalStateException if the connection to OpenOffice cannot be established
     */
    public static File convert(File docFile, String filepath) {
        File htmlFile = new File(filepath + "/" + System.currentTimeMillis() + ".html");
        OpenOfficeConnection con = new SocketOpenOfficeConnection(OPEN_OFFICE_PORT);
        try {
            con.connect();
        } catch (ConnectException e) {
            System.out.println("获取OpenOffice连接失败...");
            // Converting over a connection that never opened cannot succeed; fail here with
            // the real cause instead of a secondary error from the converter.
            throw new IllegalStateException(
                    "Could not connect to OpenOffice on port " + OPEN_OFFICE_PORT, e);
        }
        try {
            DocumentConverter converter = new OpenOfficeDocumentConverter(con);
            converter.convert(docFile, htmlFile);
        } finally {
            // Always release the connection, even when the conversion itself fails.
            con.disconnect();
        }
        return htmlFile;
    }

    /**
     * Converts {@code docFile} to HTML and returns the cleaned markup as a string.
     *
     * <p>The generated file is decoded as UTF-8 when it starts with a UTF-8 byte order mark and
     * as GBK otherwise. Lines are concatenated without separators before cleaning.
     *
     * @param docFile  the document to convert
     * @param filepath directory for the generated HTML file; also used as the prefix for image
     *                 paths in the returned markup
     * @return the cleaned HTML fragment
     * @throws Exception if the generated file cannot be read, or if conversion fails
     */
    public static String toHtmlString(File docFile, String filepath) throws Exception {
        File htmlFile = convert(docFile, filepath);
        String charSet = detectCharset(htmlFile);

        StringBuilder htmlSb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(htmlFile), charSet))) {
            // readLine() returning null is the end-of-stream signal; ready() only reports
            // whether a read would block and must not be used to detect end of file.
            String line;
            while ((line = br.readLine()) != null) {
                htmlSb.append(line);
            }
        }
        // The UTF-8 decoder keeps the byte order mark as a leading U+FEFF character; drop it.
        if (htmlSb.length() > 0 && htmlSb.charAt(0) == '\uFEFF') {
            htmlSb.deleteCharAt(0);
        }
        return clearFormat(htmlSb.toString(), filepath);
    }

    /**
     * Chooses the charset used to decode {@code htmlFile}: "UTF-8" when the file starts with
     * the UTF-8 byte order mark (EF BB BF), "GBK" otherwise.
     */
    private static String detectCharset(File htmlFile) throws IOException {
        byte[] head = new byte[3];
        int read;
        try (InputStream in = new FileInputStream(htmlFile)) {
            read = in.read(head);
        }
        boolean hasUtf8Bom = read == 3 && head[0] == -17 && head[1] == -69 && head[2] == -65;
        return hasUtf8Bom ? "UTF-8" : "GBK";
    }

    /**
     * Reduces converter-generated HTML to a lightly formatted fragment.
     *
     * <p>Steps, in order: keep only the {@code <BODY>} element (rewritten as a {@code <DIV>})
     * when one is found; prefix every {@code <IMG SRC="} value with {@code docImgPath + "/"};
     * strip attributes from paragraph tags; remove font/span/xml/del/ins/meta and
     * namespace-prefixed tags; remove lang/class/style/size/face and namespace-prefixed
     * attributes from the remaining tags.
     *
     * @param htmlStr    the HTML to clean
     * @param docImgPath directory prepended to image sources; inserted literally
     * @return the cleaned HTML
     */
    protected static String clearFormat(String htmlStr, String docImgPath) {
        Matcher bodyMatcher = BODY_PATTERN.matcher(htmlStr);
        if (bodyMatcher.find()) {
            htmlStr = bodyMatcher.group()
                    .replaceFirst("<BODY", "<DIV")
                    .replace("</BODY>", "</DIV>");
        }

        // Literal replace: docImgPath may contain '\' or '$' (e.g. a Windows path), which a
        // regex replacement string would interpret as escapes or group references.
        htmlStr = htmlStr.replace("<IMG SRC=\"", "<IMG SRC=\"" + docImgPath + "/");

        htmlStr = PARAGRAPH_PATTERN.matcher(htmlStr).replaceAll("<p$3</p>");
        htmlStr = NOISE_TAG_PATTERN.matcher(htmlStr).replaceAll("");

        // Each pass removes at most one attribute per tag, so repeat until nothing changes.
        // Every replacement shortens the string, which guarantees termination.
        String previous;
        do {
            previous = htmlStr;
            htmlStr = NOISE_ATTRIBUTE_PATTERN.matcher(htmlStr).replaceAll("<$1$2>");
        } while (!htmlStr.equals(previous));

        return htmlStr;
    }
}