openOffice 转换文件格式

最新推荐文章于 2024-03-22 14:41:29 发布

javaweiming

最新推荐文章于 2024-03-22 14:41:29 发布

阅读量5.6k

点赞数 2

分类专栏： java 文章标签： openoffice

本文链接：https://blog.csdn.net/javaweiming/article/details/16860579

版权

java 专栏收录该内容

53 篇文章 0 订阅

订阅专栏

引言：突然接到任务，要将word或者ppt转换成HTML的格式在页面上显示，类似于百度文库的效果。以前也听说过，觉得用java实现起来还是很简单的。于是我就带着我的任务以及我的好奇心出发了，在网上找了些资料，最终决定用OpenOffice。

首先简单的介绍下转换需要的环境：

1、转换需要安装OpenOffice软件（下载地址：http://download.openoffice.org/index.html）

2、需要下载jodconverter包

在此我提供了jodconverter包（包括jodconverter2、jodconverter3,注：jodconverter2需要手动启动openoffice服务，如有不清楚的地方可以在我文章下面留言），下载请点击...

JODConverter是一个开源文档转换工具，既可以应用于Linux平台，也可以应用于Windows平台。它基于OpenOffice.org或者LibreOffice，因此文档转换服务器上必须安装有OpenOffice或者LibreOffice。

　　目前最新版本的JODConverter为JODConverter3.0，它要求JDK1.5以上的Java环境，同时还需要OpenOffice.org 3.x版本。本文基于最新版本3.0设计实现，如果是版本为2，则有不同的实现。（版本2需要手动启动OpenOffice.org服务，或者创建Windows服务设置为开机启动，而版本3提供了开启服务的接口，在此我使用的是版本3）

一切准备就绪那就直接开始了...

下面是一个比较完整的例子，演示了 WORD==>HTML、PPT==>HTML 的转换。JODConverter会根据输出文件的扩展名决定目标格式，因此把代码中输出文件的后缀由 .html 改为 .pdf，即可实现 WORD==>PDF、PPT==>PDF 的转换。

package core;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.artofsolving.jodconverter.OfficeDocumentConverter;
import org.artofsolving.jodconverter.office.DefaultOfficeManagerConfiguration;
import org.artofsolving.jodconverter.office.OfficeManager;

/**
 * Converts office documents (Word, PowerPoint, ...) to HTML by driving a local
 * OpenOffice.org installation through JODConverter 3, and optionally reduces
 * the generated HTML to a simplified fragment.
 *
 * <p>Typical flow: a client uploads a document, the server has OpenOffice open
 * it, and OpenOffice saves it again as HTML.
 *
 * <p>The office manager is kept in a static field and is started and stopped
 * around every conversion, so overlapping conversions from several threads are
 * not supported by this class.
 */
public class OpenOfficePdfConvert {

	/** Manager of the OpenOffice process; set by {@link #startService()}, cleared by {@link #stopService()}. */
	private static OfficeManager officeManager;

	/** Directory in which OpenOffice.org is installed. */
	private static final String OFFICE_HOME = "d:\\Program Files\\OpenOffice.org 3";

	/** Port(s) handed to JODConverter for talking to the OpenOffice process. */
	private static final int[] port = { 8100 };

	/** Everything from the opening BODY tag (with or without attributes) to the closing one. */
	private static final Pattern BODY_PATTERN = Pattern.compile(
			"<BODY\\b.*</BODY>", Pattern.DOTALL);

	/** An upper-case P element; the word boundary keeps PRE/PARAM from matching. */
	private static final Pattern PARAGRAPH_PATTERN = Pattern.compile(
			"<P(?:\\s[^>]*)?>(.*?)</P>", Pattern.DOTALL);

	/** Any opening tag, used to restrict attribute clean-up to tag text only. */
	private static final Pattern TAG_PATTERN = Pattern.compile("<[A-Za-z][^>]*>");

	/** One attribute inside a tag: leading whitespace, name, optional value. */
	private static final Pattern ATTRIBUTE_PATTERN = Pattern.compile(
			"\\s+([^\\s=>/]+)(?:\\s*=\\s*(?:\"[^\"]*\"|'[^']*'|[^\\s>]+))?");

	/** Attribute names that are dropped from the simplified HTML. */
	private static final Pattern UNWANTED_ATTRIBUTE_NAME = Pattern.compile(
			"lang|class|style|size|face|[ovwxp]:\\w+", Pattern.CASE_INSENSITIVE);

	/**
	 * Converts a document to an HTML file named after the current timestamp.
	 *
	 * <p>The office service is started before the conversion and is always
	 * stopped afterwards, even when the conversion fails.
	 *
	 * @param inputFile  path of the document to convert
	 * @param outputFile directory in which the HTML file is created
	 * @return the HTML file that was written
	 * @throws FileNotFoundException declared for callers; kept for compatibility
	 * @throws IllegalStateException if the office service could not be started
	 */
	public File convertToHtml(String inputFile, String outputFile)
			throws FileNotFoundException {
		// Target file: <outputFile>/<current millis>.html
		File wantFile = new File(outputFile + File.separator
				+ System.currentTimeMillis() + ".html");

		// Without a running service the converter cannot work; fail early
		// instead of continuing with an unusable manager.
		if (!startService()) {
			throw new IllegalStateException(
					"OpenOffice conversion service could not be started");
		}
		try {
			System.out.println("进行文档转换转换:" + inputFile + " --> " + outputFile);
			OfficeDocumentConverter converter = new OfficeDocumentConverter(
					officeManager);
			converter.convert(new File(inputFile), wantFile);
		} finally {
			// Always shut the office process down, even if convert() threw.
			stopService();
		}
		System.out.println();
		return wantFile;
	}

	/**
	 * Configures and starts the OpenOffice service.
	 *
	 * @return {@code true} if the service started, {@code false} if
	 *         configuration or start-up threw an exception
	 */
	public static Boolean startService() {
		DefaultOfficeManagerConfiguration configuration = new DefaultOfficeManagerConfiguration();
		try {
			System.out.println("准备启动服务....");
			configuration.setOfficeHome(OFFICE_HOME); // OpenOffice.org installation directory
			configuration.setPortNumbers(port); // conversion port(s)
			configuration.setTaskExecutionTimeout(1000 * 60 * 5L); // 5 minutes per task
			configuration.setTaskQueueTimeout(1000 * 60 * 60 * 24L); // 24 hours in the queue

			officeManager = configuration.buildOfficeManager();
			officeManager.start();
			System.out.println("office转换服务启动成功!");
			return true;
		} catch (Exception ce) {
			System.out.println("office转换服务启动失败!详细信息:" + ce);
			return false;
		}
	}

	/**
	 * Stops the OpenOffice service if one was created and forgets the manager,
	 * so that a later call does not stop the same manager twice.
	 */
	public static void stopService() {
		System.out.println("关闭office转换服务....");
		if (officeManager != null) {
			try {
				officeManager.stop();
			} finally {
				officeManager = null;
			}
		}
		System.out.println("关闭office转换成功!");
	}

	/**
	 * Manual smoke test: converts a sample document and reports success.
	 *
	 * <p>To obtain the simplified HTML as a string instead, call
	 * {@link #toHtmlString(String, String)} with the same arguments.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		String inputFile = "c:\\test\\test.docx";
		String outputFile = "c:\\test";
		OpenOfficePdfConvert opc = new OpenOfficePdfConvert();
		try {
			opc.convertToHtml(inputFile, outputFile);
			// Report success only when the conversion did not throw.
			System.out.println("恭喜您，转换成功...");
		} catch (FileNotFoundException e1) {
			e1.printStackTrace();
		}
	}

	/**
	 * Converts a document to HTML and returns the simplified HTML markup.
	 *
	 * @param docFile  document to convert
	 * @param filepath directory that receives the generated HTML file; it is
	 *                 also prefixed to every image source in the result
	 * @return the cleaned HTML markup; if the generated file cannot be read the
	 *         error is printed and whatever was read so far is cleaned and returned
	 * @throws FileNotFoundException propagated from {@link #convertToHtml(String, String)}
	 */
	public static String toHtmlString(String docFile, String filepath)
			throws FileNotFoundException {
		System.out.println("文档中图片的保存位置 ==>" + filepath);
		OpenOfficePdfConvert opc = new OpenOfficePdfConvert();
		File htmlFile = opc.convertToHtml(docFile, filepath);

		StringBuilder html = new StringBuilder();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new InputStreamReader(
					new FileInputStream(htmlFile)));
			String line;
			// readLine() returning null is the end-of-file signal; ready() only
			// says whether a read would block.
			while ((line = reader.readLine()) != null) {
				// A line break is whitespace in HTML: keep one space so words and
				// attributes split across lines are not glued together.
				if (html.length() > 0) {
					html.append(' ');
				}
				html.append(line);
			}
			// The generated file is intentionally kept on disk.
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
		}
		return clearFormat(html.toString(), filepath);
	}

	/**
	 * Reduces HTML to a simplified fragment.
	 *
	 * <p>Steps, in order: keep only the BODY element and turn it into a DIV;
	 * prefix image sources with {@code docImgPath}; strip all attributes from
	 * upper-case P elements (emitting lower-case {@code p}); remove font, span,
	 * xml, del, ins, meta and namespaced tags; remove lang, class, style, size,
	 * face and namespaced attributes from the remaining tags.
	 *
	 * @param htmlStr    HTML markup to clean
	 * @param docImgPath path inserted, followed by "/", in front of every image
	 *                   source; it is used literally
	 * @return the simplified markup
	 */
	protected static String clearFormat(String htmlStr, String docImgPath) {
		Matcher bodyMatcher = BODY_PATTERN.matcher(htmlStr);
		if (bodyMatcher.find()) {
			htmlStr = bodyMatcher.group().replaceFirst("<BODY", "<DIV")
					.replace("</BODY>", "</DIV>");
		}
		// Literal replace: the path may contain '\' or '$', which replaceAll
		// would interpret as replacement syntax.
		htmlStr = htmlStr.replace("<IMG SRC=\"", "<IMG SRC=\"" + docImgPath
				+ "/");
		// Drop every attribute of P elements, keep their content.
		htmlStr = PARAGRAPH_PATTERN.matcher(htmlStr).replaceAll("<p>$1</p>");
		// Remove tags that are not wanted at all.
		htmlStr = htmlStr
				.replaceAll(
						"<[/]?(font|FONT|span|SPAN|xml|XML|del|DEL|ins|INS|meta|META|[ovwxpOVWXP]:\\w+)[^>]*?>",
						"");
		return removeUnwantedAttributes(htmlStr);
	}

	/** Removes the unwanted attributes from every opening tag, leaving text content untouched. */
	private static String removeUnwantedAttributes(String html) {
		Matcher tagMatcher = TAG_PATTERN.matcher(html);
		StringBuffer result = new StringBuffer(html.length());
		while (tagMatcher.find()) {
			tagMatcher.appendReplacement(result,
					Matcher.quoteReplacement(cleanTag(tagMatcher.group())));
		}
		tagMatcher.appendTail(result);
		return result.toString();
	}

	/** Rebuilds a single tag without its unwanted attributes; other attributes are kept verbatim. */
	private static String cleanTag(String tag) {
		Matcher attributeMatcher = ATTRIBUTE_PATTERN.matcher(tag);
		StringBuffer cleaned = new StringBuffer(tag.length());
		while (attributeMatcher.find()) {
			boolean unwanted = UNWANTED_ATTRIBUTE_NAME.matcher(
					attributeMatcher.group(1)).matches();
			String kept = unwanted ? "" : attributeMatcher.group();
			attributeMatcher.appendReplacement(cleaned,
					Matcher.quoteReplacement(kept));
		}
		attributeMatcher.appendTail(cleaned);
		return cleaned.toString();
	}
}

主要类说明：
OfficeManager是一个接口，主要定义了三个方法：
1.public void start( )启动OpenOffice服务
2.public void stop( )停止OpenOffice服务
3.public void execute(OfficeTask task)执行转换任务

　DefaultOfficeManagerConfiguration是用于配置并创建OfficeManager的配置类（它本身并不实现OfficeManager接口，而是通过buildOfficeManager()方法返回一个OfficeManager实例），其提供了相关方法配置OpenOffice.org，比如：

　public DefaultOfficeManagerConfiguration setOfficeHome(String officeHome)设置OpenOffice.org或者LibreOffice安装目录，
Windows下默认值为“C:\Program Files\OpenOffice.org 3”（LibreOffice进行相应更改），因此如果OpenOffice.org安装在别的目录，必须设置此项。

　public DefaultOfficeManagerConfiguration setConnectionProtocol(OfficeConnectionProtocol conn)设置连接协议，确定使用管道通信，还是socket通信。

　public DefaultOfficeManagerConfiguration setTemplateProfileDir(File templateProfileDir)设定OpenOffice用户配置（profile）的模板目录。

除以上几个方法之外，DefaultOfficeManagerConfiguration还提供了别的配置OpenOffice.org的方法，具体方法可以查询JODConverter API手册。
配置完之后，必须要执行方法buildOfficeManager()，实现真正的配置。

　OfficeDocumentConverter中主要包含convert方法，该方法实际上调用的是实现OfficeManager接口的类中的execute方法。

整体看起来还是比较简单的,但对自己也是一种提高，在此特别感谢肖恩也有梦想：http://www.cnblogs.com/luckyxiaoxuan/archive/2012/06/14/2549012.html

javaweiming

关注

2
点赞
踩
4

收藏

觉得还不错? 一键收藏
1
评论
openOffice 转换文件格式

引言：突然接到任务，要将word或者ppt转换成HTML的格式在页面上显示，类似于百度文库的效果。以前也听说过，觉得用java实现起来还是很简单的。于是我就带着我的任务以及我的好奇心出发了，在网上找了些资料，最终决定用OpenOffice。首先简单的介绍下转换需要的环境： 1、转换组要安装openoffice软件 2、需要下载jodconverter包一切准备就绪
复制链接

扫一扫