最近项目上有需求,需要将DOC文件中的内容转换为HTML。以下是通过OpenOffice组件来实现的。在服务器上要安装OpenOffice的客户端,并启动OpenOffice服务。
打开openoffice服务的命令:
cd C:\Program Files\OpenOffice.org 3\program
soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard
1、下载OpenOffice,http://download.openoffice.org/index.html So easy...
2、下载Jodconverter http://www.artofsolving.com/opensource/jodconverter 这是一个调用OpenOffice进行格式转换的第三方jar包。
package cn.com.jit.pki.rms.server.utils;
import org.apache.commons.fileupload.util.Streams;
import org.apache.log4j.Logger;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.ConnectException;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import cn.com.jit.pki.rms.server.error.RMSException;
import com.artofsolving.jodconverter.DocumentConverter;
import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;
/**
 * Converts Word documents to HTML through a running OpenOffice service
 * (driven via JODConverter) and reduces the generated markup to a fragment
 * that can be embedded in another page.
 *
 * <p>An OpenOffice service must be reachable on port 8100.</p>
 */
public class Doc2Html {
    /**
     * Logger for this class
     */
    private static final Logger logger = Logger.getLogger(Doc2Html.class);

    /** Port the OpenOffice service is expected to listen on. */
    private static final int OPEN_OFFICE_PORT = 8100;

    /** Charset used to decode the HTML file produced by the conversion. */
    private static final String HTML_CHARSET = "gbk";

    /** Matches the BODY element; has no DOTALL flag, so the document must be on one line. */
    private static final Pattern BODY_PATTERN = Pattern.compile("<BODY .*</BODY>");

    /** Matches a P element so that its attributes can be dropped. */
    private static final Pattern PARAGRAPH_PATTERN = Pattern
            .compile("(<P)([^>]*)(>.*?)(<\\/P>)");

    /** Matches opening and closing tags that are removed entirely (their content is kept). */
    private static final Pattern UNWANTED_TAG_PATTERN = Pattern
            .compile("<[/]?(font|FONT|span|SPAN|xml|XML|del|DEL|ins|INS|meta|META|[ovwxpOVWXP]:\\w+)[^>]*?>");

    /**
     * Matches a tag containing one unwanted attribute. Group 1 is the tag text
     * before the attribute, group 2 the tag text after it. The value may be
     * single-quoted, double-quoted or unquoted (up to the next whitespace).
     */
    private static final Pattern UNWANTED_ATTRIBUTE_PATTERN = Pattern
            .compile("<([^>]*)(?:lang|LANG|class|CLASS|style|STYLE|size|SIZE|face|FACE|[ovwxpOVWXP]:\\w+)=(?:'[^']*'|\"[^\"]*\"|[^\\s>]+)([^>]*)>");

    /**
     * Converts a document to an HTML file using the OpenOffice service.
     *
     * @param docFile
     *            the document to convert
     * @param filepath
     *            directory in which the HTML file is created; the file is
     *            named after the current time in milliseconds
     * @return the generated HTML file
     * @throws RMSException
     *             with code "11300039" when the OpenOffice service cannot be
     *             reached
     */
    public static File convert(File docFile, String filepath) throws RMSException {
        logger.debug("convert(File, String) - start");

        // Target file for the generated HTML.
        File htmlFile = new File(filepath + "/" + System.currentTimeMillis()
                + ".html");

        OpenOfficeConnection con = new SocketOpenOfficeConnection(OPEN_OFFICE_PORT);
        try {
            con.connect();
        } catch (ConnectException e) {
            logger.error("convert(File, String)", e);
            throw new RMSException("11300039", e);
        }

        try {
            DocumentConverter converter = new OpenOfficeDocumentConverter(con);
            converter.convert(docFile, htmlFile);
        } finally {
            // Always release the connection, even when the conversion fails.
            con.disconnect();
        }

        logger.debug("convert(File, String) - end");
        return htmlFile;
    }

    /**
     * Converts a Word document to HTML and returns the cleaned-up markup.
     *
     * <p>The intermediate HTML file is deleted before this method returns,
     * whether or not reading it succeeded.</p>
     *
     * @param docFile
     *            the document to convert
     * @param filepath
     *            directory in which the intermediate HTML file is written
     * @param imgPath
     *            URL prefix prepended to every image source in the result
     * @param id
     *            not used by this method
     * @return the converted and cleaned HTML
     * @throws RMSException
     *             with code "11300039" when the OpenOffice service cannot be
     *             reached, or "11300038" when the generated file cannot be read
     */
    public static String toHtmlString(File docFile, String filepath, String imgPath, String id) throws RMSException {
        logger.debug("toHtmlString(File, String, String) - start");

        File htmlFile = convert(docFile, filepath);

        String htmlStr;
        try {
            htmlStr = readJoinedLines(htmlFile);
        } catch (IOException e) {
            // Also covers FileNotFoundException, which was reported with the same code.
            logger.error("toHtmlString(File, String, String)", e);
            throw new RMSException("11300038", e);
        } finally {
            // Remove the temporary file on both the success and the failure path.
            if (htmlFile.exists() && !htmlFile.delete()) {
                logger.warn("toHtmlString(File, String, String) - could not delete temporary file " + htmlFile);
            }
        }

        // imgPath is expected to look like http://...:8080/webmanager/newsImage/newsId/
        htmlStr = clearFormat(htmlStr, imgPath);

        logger.debug("toHtmlString(File, String, String) - end");
        return htmlStr;
    }

    /**
     * Reads the whole file and returns its lines concatenated without any
     * separator, so that the single-line body pattern can match.
     */
    private static String readJoinedLines(File file) throws IOException {
        StringBuilder joined = new StringBuilder();
        FileInputStream in = new FileInputStream(file);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, HTML_CHARSET));
            String line;
            // readLine() returning null is the end-of-stream signal; ready() is not.
            while ((line = reader.readLine()) != null) {
                // The decoded line is already a Java String; no re-encoding is needed.
                joined.append(line);
            }
        } finally {
            closeQuietly(in);
        }
        return joined.toString();
    }

    /** Closes the stream, logging instead of propagating a failure. */
    private static void closeQuietly(FileInputStream in) {
        try {
            in.close();
        } catch (IOException e) {
            logger.warn("toHtmlString(File, String, String) - failed to close HTML input stream", e);
        }
    }

    /**
     * Strips markup that is not wanted in the embedded fragment.
     *
     * <p>Steps, in order: keep only the BODY element and turn it into a DIV;
     * prefix image sources with {@code docImgPath}; drop all attributes of P
     * elements; remove font/span/xml/del/ins/meta and namespaced tags; remove
     * lang/class/style/size/face and namespaced attributes from the remaining
     * tags.</p>
     *
     * @param htmlStr
     *            HTML produced by the conversion
     * @param docImgPath
     *            prefix inserted (followed by "/") in front of every image source
     * @return the cleaned HTML
     */
    protected static String clearFormat(String htmlStr, String docImgPath) {
        logger.debug("clearFormat(String, String) - start");

        // Keep only the BODY element and rename it to DIV.
        Matcher bodyMatcher = BODY_PATTERN.matcher(htmlStr);
        if (bodyMatcher.find()) {
            htmlStr = bodyMatcher.group().replaceFirst("<BODY", "<DIV")
                    .replace("</BODY>", "</DIV>");
        }

        // Literal replacement: '$' or '\' in docImgPath must not be read as
        // regex group references.
        htmlStr = htmlStr.replace("<IMG SRC=\"", "<IMG SRC=\"" + docImgPath
                + "/");

        // Drop every attribute of P elements.
        htmlStr = PARAGRAPH_PATTERN.matcher(htmlStr).replaceAll("<p$3</p>");

        // Remove unwanted tags, keeping their content.
        htmlStr = UNWANTED_TAG_PATTERN.matcher(htmlStr).replaceAll("");

        // Each pass removes at most one attribute per tag, so repeat until
        // nothing changes. Every replacement shortens the text, so this ends.
        String previous;
        do {
            previous = htmlStr;
            htmlStr = UNWANTED_ATTRIBUTE_PATTERN.matcher(htmlStr).replaceAll("<$1$2>");
        } while (!htmlStr.equals(previous));

        logger.debug("clearFormat(String, String) - end");
        return htmlStr;
    }
}