OpenOffice【DOC2HTML】

最新推荐文章于 2025-02-28 04:27:29 发布

slikel

最新推荐文章于 2025-02-28 04:27:29 发布

阅读量268

点赞数

分类专栏： JAVA组件文章标签： java

本文链接：https://blog.csdn.net/slikel/article/details/84288874

版权

JAVA组件专栏收录该内容

1 篇文章

订阅专栏

本文介绍了一种使用OpenOffice和JOD Converter将DOC文件转换为HTML的方法。首先安装所需软件和服务，然后通过Java代码实现文件转换及格式清理。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

最近项目上有需求，需要将DOC文件中的内容转换为HTML。以下是通过OpenOffice组件来实现的。在服务器上要安装OpenOffice客户端，并启动OpenOffice服务。

打开openoffice服务的命令：

cd C:\Program Files\OpenOffice.org 3\program

soffice -headless -accept="socket,host=127.0.0.1,port=8100;urp;" -nofirststartwizard

1、下载OpenOffice，http://download.openoffice.org/index.html So easy...

2、下载JODConverter：http://www.artofsolving.com/opensource/jodconverter 。这是一个通过连接OpenOffice服务来进行格式转换的第三方jar包。

package cn.com.jit.pki.rms.server.utils;

import org.apache.commons.fileupload.util.Streams;
import org.apache.log4j.Logger;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.ConnectException;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import cn.com.jit.pki.rms.server.error.RMSException;

import com.artofsolving.jodconverter.DocumentConverter;
import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;

/**
 * Converts a word-processor document to HTML through a running OpenOffice
 * service (via JODConverter) and reduces the generated markup to a simplified
 * fragment.
 */
public class Doc2Html {
	/**
	 * Logger for this class
	 */
	private static final Logger logger = Logger.getLogger(Doc2Html.class);

	/** Port on which the OpenOffice service is contacted. */
	private static final int OPEN_OFFICE_PORT = 8100;

	/** Charset used to decode the generated HTML file. */
	private static final String HTML_CHARSET = "gbk";

	/** Everything from the first opening BODY tag to the last closing one. */
	private static final Pattern BODY_PATTERN = Pattern.compile(
			"<BODY\\b.*</BODY>", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/**
	 * A complete P element. The tag name must be followed by whitespace or
	 * '>' so that PRE, PARAM and similar tags are not mistaken for P.
	 */
	private static final Pattern PARAGRAPH_PATTERN = Pattern.compile(
			"<P(?:\\s[^>]*)?>(.*?)</P>",
			Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Opening or closing tags that are dropped (their content is kept). */
	private static final Pattern UNWANTED_TAG_PATTERN = Pattern.compile(
			"</?(?:font|span|xml|del|ins|meta|[ovwxp]:\\w+)\\b[^>]*>",
			Pattern.CASE_INSENSITIVE);

	/** An opening tag: group 1 is the tag name, group 2 the attribute text. */
	private static final Pattern OPEN_TAG_PATTERN = Pattern
			.compile("<([A-Za-z][^\\s>]*)([^>]*)>");

	/**
	 * One attribute inside a tag, including its leading whitespace. Group 1 is
	 * the attribute name; the value may be single-quoted, double-quoted,
	 * unquoted or absent.
	 */
	private static final Pattern ATTRIBUTE_PATTERN = Pattern
			.compile("\\s+([^\\s=>]+)(?:\\s*=\\s*(?:'[^']*'|\"[^\"]*\"|[^\\s>]+))?");

	/** Names of attributes that are removed from every tag. */
	private static final Pattern UNWANTED_ATTRIBUTE_NAME = Pattern.compile(
			"lang|class|style|size|face|[ovwxp]:\\w+",
			Pattern.CASE_INSENSITIVE);

	/**
	 * Converts a document to an HTML file using the OpenOffice service.
	 *
	 * @param docFile
	 *            the document to convert
	 * @param filepath
	 *            directory in which the HTML file is created; the file is
	 *            named after the current time in milliseconds
	 * @return the generated HTML file
	 * @throws RMSException
	 *             with code 11300039 when the connection to OpenOffice cannot
	 *             be established
	 */
	public static File convert(File docFile, String filepath) throws RMSException {
		if (logger.isDebugEnabled()) {
			logger.debug("convert(File, String) - start");
		}

		// Target file that will receive the generated HTML.
		File htmlFile = new File(filepath + "/" + new Date().getTime()
				+ ".html");
		// Open the connection to the OpenOffice service.
		OpenOfficeConnection con = new SocketOpenOfficeConnection(
				OPEN_OFFICE_PORT);
		try {
			con.connect();
		} catch (ConnectException e) {
			logger.error("convert(File, String)", e);

			throw new RMSException("11300039", e);
		}
		try {
			// Convert the document to HTML.
			DocumentConverter converter = new OpenOfficeDocumentConverter(con);
			converter.convert(docFile, htmlFile);
		} finally {
			// Always release the connection, even when the conversion fails;
			// any exception from the converter still propagates to the caller.
			con.disconnect();
		}

		if (logger.isDebugEnabled()) {
			logger.debug("convert(File, String) - end");
		}
		return htmlFile;
	}

	/**
	 * Converts a word document to HTML and returns the cleaned HTML markup.
	 * The intermediate HTML file is deleted before this method returns.
	 *
	 * @param docFile
	 *            the document to convert
	 * @param filepath
	 *            directory in which the intermediate HTML file is created
	 *            (described by the original author as the place where the
	 *            document's images are saved)
	 * @param imgPath
	 *            URL prefix under which the document's images are served; it
	 *            is prepended to every image source
	 * @param id
	 *            currently unused; kept so existing callers keep compiling
	 * @return the cleaned HTML markup
	 * @throws RMSException
	 *             with code 11300039 when OpenOffice cannot be reached, or
	 *             with code 11300038 when the generated file cannot be read
	 */
	public static String toHtmlString(File docFile, String filepath, String imgPath, String id) throws RMSException {
		if (logger.isDebugEnabled()) {
			logger.debug("toHtmlString(File, String, String) - start");
		}

		// Convert the word document.
		File htmlFile = convert(docFile, filepath);
		StringBuilder htmlSb = new StringBuilder();
		try {
			FileInputStream in = new FileInputStream(htmlFile);
			try {
				BufferedReader br = new BufferedReader(new InputStreamReader(
						in, HTML_CHARSET));
				String line;
				// readLine() returning null is the end-of-file signal;
				// ready() only says whether a read might block.
				while ((line = br.readLine()) != null) {
					// Lines are joined without a separator, as before, so the
					// markup handed to clearFormat contains no line breaks.
					htmlSb.append(line);
				}
			} finally {
				in.close();
			}
		} catch (IOException e) {
			// Also covers FileNotFoundException, which was reported with the
			// same error code.
			logger.error("toHtmlString(File, String, String)", e);

			throw new RMSException("11300038", e);
		} finally {
			// Remove the temporary file on success and on failure alike.
			htmlFile.delete();
		}
		// imgPath is expected to look like
		// http://...:8080/webmanager/newsImage/newsId/
		String htmlStr = clearFormat(htmlSb.toString(), imgPath);
		if (logger.isDebugEnabled()) {
			logger.debug("toHtmlString(File, String, String) - end");
		}
		return htmlStr;

	}

	/**
	 * Strips markup that is not wanted in the final HTML fragment.
	 * <p>
	 * The steps are: keep only the BODY element and turn it into a DIV,
	 * prefix image sources with <code>docImgPath</code>, replace every P
	 * element by an attribute-free <code>&lt;p&gt;</code>, drop font, span,
	 * xml, del, ins, meta and namespaced (o:, v:, w:, x:, p:) tags, and
	 * finally remove lang, class, style, size, face and namespaced attributes
	 * from the remaining tags.
	 *
	 * @param htmlStr
	 *            HTML text carrying the full generated markup
	 * @param docImgPath
	 *            prefix inserted (followed by "/") in front of every image
	 *            source; it is used literally, so characters such as '$' or
	 *            '\' are safe
	 * @return the simplified HTML text
	 */
	protected static String clearFormat(String htmlStr, String docImgPath) {
		if (logger.isDebugEnabled()) {
			logger.debug("clearFormat(String, String) - start");
		}

		Matcher bodyMatcher = BODY_PATTERN.matcher(htmlStr);
		if (bodyMatcher.find()) {
			// Keep the BODY element only and rename it to DIV.
			htmlStr = bodyMatcher.group().replaceFirst("(?i)<BODY\\b", "<DIV")
					.replaceAll("(?i)</BODY>", "</DIV>");
		}
		// Point image sources at the published image location. A literal
		// replace is used so the path is never parsed as a regex replacement.
		htmlStr = htmlStr.replace("<IMG SRC=\"", "<IMG SRC=\"" + docImgPath
				+ "/");
		// Replace <P ...>...</P> by <p>...</p>, dropping the attributes.
		htmlStr = PARAGRAPH_PATTERN.matcher(htmlStr).replaceAll("<p>$1</p>");
		// Drop the unwanted tags.
		htmlStr = UNWANTED_TAG_PATTERN.matcher(htmlStr).replaceAll("");
		// Drop the unwanted attributes.
		htmlStr = stripUnwantedAttributes(htmlStr);

		if (logger.isDebugEnabled()) {
			logger.debug("clearFormat(String, String) - end");
		}
		return htmlStr;
	}

	/** Rewrites every opening tag without its unwanted attributes. */
	private static String stripUnwantedAttributes(String html) {
		Matcher tagMatcher = OPEN_TAG_PATTERN.matcher(html);
		StringBuffer result = new StringBuffer(html.length());
		while (tagMatcher.find()) {
			String cleanedTag = "<" + tagMatcher.group(1)
					+ keepWantedAttributes(tagMatcher.group(2)) + ">";
			tagMatcher.appendReplacement(result,
					Matcher.quoteReplacement(cleanedTag));
		}
		tagMatcher.appendTail(result);
		return result.toString();
	}

	/** Returns the attribute text of one tag minus the unwanted attributes. */
	private static String keepWantedAttributes(String attributes) {
		Matcher attributeMatcher = ATTRIBUTE_PATTERN.matcher(attributes);
		StringBuffer kept = new StringBuffer(attributes.length());
		while (attributeMatcher.find()) {
			boolean unwanted = UNWANTED_ATTRIBUTE_NAME.matcher(
					attributeMatcher.group(1)).matches();
			attributeMatcher.appendReplacement(kept, unwanted ? "" : Matcher
					.quoteReplacement(attributeMatcher.group()));
		}
		attributeMatcher.appendTail(kept);
		return kept.toString();
	}

}