word文件不能上传到linux,openOfficeConnection 在linux环境下word excel转html时，中文文件名无法识别问题...-CSDN博客

package com.webflowApp.common;

import com.artofsolving.jodconverter.BasicDocumentFormatRegistry;

import com.artofsolving.jodconverter.DefaultDocumentFormatRegistry;

import com.artofsolving.jodconverter.DocumentConverter;

import com.artofsolving.jodconverter.DocumentFormat;

import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;

import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;

import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;

import com.artofsolving.jodconverter.openoffice.converter.StreamOpenOfficeDocumentConverter;

import com.workflow.document.SystemDocumentFormatRegistry;

import net.sf.jasperreports.util.StringBufferWriter;

import org.jsoup.Jsoup;

import org.jsoup.nodes.Document;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.io.*;

import java.net.ConnectException;

import java.util.Date;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Created by Administrator on 2018/8/14.
 */

public class OpenOfficeDoc {

private static Logger logger= LoggerFactory.getLogger(OpenOfficeDoc.class);

/**

* 将word文档转换成html文档

* @param docFile 需要转换的word文档

* @param filepath 转换之后html的存放路径

* @return 转换之后的html文件

public static File convert(File docFile, String filepath) throws Exception {

if(!docFile.exists()){

throw new Exception("源文件不存在："+docFile.getPath());

}

// 创建保存html的文件

String hz=".html";

File htmlFile = new File(filepath + "/" + new Date().getTime()+ hz);

// 连接openoffice服务

OpenOfficeConnection connection = new SocketOpenOfficeConnection("127.0.0.1", 8100);

// 连接

connection.connect();

if(!htmlFile.exists()){

htmlFile.createNewFile(); //创建输出文件

}

// 创建转换器

DocumentConverter converter = new OpenOfficeDocumentConverter(connection);

converter.convert(docFile,htmlFile);

connection.disconnect();

return htmlFile;

}

/**
 * Guesses the character encoding of a stream from its leading byte-order mark.
 *
 * <p>Up to three bytes are consumed and the stream is then closed, whether or not the
 * read succeeded. The mapping is:
 * <ul>
 *   <li>{@code FF FE} - {@code "UTF-16"}</li>
 *   <li>{@code FE FF} - {@code "Unicode"}</li>
 *   <li>{@code EF BB BF} - {@code "UTF-8"}</li>
 *   <li>anything else (including streams shorter than the mark) - {@code "gb2312"}</li>
 * </ul>
 *
 * @param inputStream stream positioned at the start of the content; closed by this method
 * @return the charset name chosen from the table above
 * @throws Exception if reading from or closing the stream fails
 */
public static String resolveCode(InputStream inputStream) throws Exception {
    byte[] head = new byte[3];
    try {
        // A single read() may legally return fewer bytes than requested, so keep
        // reading until the header is full or the stream ends.
        int filled = 0;
        while (filled < head.length) {
            int count = inputStream.read(head, filled, head.length - filled);
            if (count <= 0) {
                break;
            }
            filled += count;
        }
    } finally {
        inputStream.close();
    }

    String code = "gb2312"; // or GBK
    if (head[0] == -1 && head[1] == -2) {
        code = "UTF-16";
    } else if (head[0] == -2 && head[1] == -1) {
        code = "Unicode";
    } else if (head[0] == -17 && head[1] == -69 && head[2] == -65) {
        code = "UTF-8";
    }
    return code;
}

/**

* 将word转换成html文件，并且获取html文件代码。

* @param docFile 需要转换的文档

* @param filepath 文档中图片的保存位置

* @return 转换成功的html代码

public static String toHtmlString(final File docFile, final String filepath) {

String htmlStr ="";

try {

if(docFile.getName().indexOf("txt")!=-1){

// 获取html文件流

StringBuffer htmlSb = new StringBuffer();

String code=resolveCode(new FileInputStream(docFile));

InputStreamReader isr = new InputStreamReader(new FileInputStream(docFile), code);

BufferedReader br = new BufferedReader(isr);

while (br.ready()) {

htmlSb.append(br.readLine());

}

br.close();

// HTML文件字符串

htmlStr = htmlSb.toString();

}else{

final File htmlFile = convert(docFile, filepath);

// 获取html文件流

StringBuffer htmlSb = new StringBuffer();

Document parse = Jsoup.parse(htmlFile, "UTF-8");

htmlSb.append(parse.html());

//启动线程5秒后清理当前生成的文件

final long startTime=System.currentTimeMillis();

Thread thread=new Thread(new Runnable() {

@Override

public void run() {

while (true){

long endTime=System.currentTimeMillis();

if(((endTime - startTime) / 1000)>20){

File[] files = new File(filepath).listFiles();

for (int i = 0; i < files.length; i++) {

File file = files[i];

if(file.getName().indexOf(htmlFile.getName().substring(0,htmlFile.getName().indexOf(".")))!=-1){

file.delete();

System.out.println("已删除文件："+file.getName());

}

break;

}

});

thread.start();

// HTML文件字符串

htmlStr = htmlSb.toString();

// 返回经过清洁的html文本

return htmlStr;

}

}catch (Exception e) {

logger.error("文件转换异常！",e);

}

return htmlStr;

}

/**
 * Strips HTML markup that is not needed.
 *
 * @param htmlStr HTML text that contains complex markup
 * @return the text with the unneeded markup removed
 */

protected static String clearFormat(String htmlStr, String docImgPath) {

// 获取body内容的正则

String bodyReg = "

Pattern bodyPattern = Pattern.compile(bodyReg);

Matcher bodyMatcher = bodyPattern.matcher(htmlStr);

if (bodyMatcher.find()) {

// 获取BODY内容，并转化BODY标签为DIV

htmlStr = bodyMatcher.group().replaceFirst("

.replaceAll("", "

");