openOfficeConnection 在linux环境下word excel转html时，中文文件名无法识别问题_com.sun.star.lang.illegalargumentexception: url se-CSDN博客

本文链接：https://blog.csdn.net/u011697091/article/details/81780308

一，发现问题

在 Windows 环境下，中文文件名的文件都可以被识别并成功转换成 html；但是到了 Linux 环境，就一直报错，错误信息为：Caused by: com.sun.star.lang.IllegalArgumentException: URL seems to be an unsupported one.

二，定位问题

于是远程 debug 一步步调试，发现 OpenOfficeDocumentConverter 类里面有个方法 convertInternal 是用来处理源文件路径的，其中 String inputUrl = fileContentProvider.getFileURLFromSystemPath("", inputFile.getAbsolutePath()); 获取的 url 就是文件路径。调试发现中文文件名转换后变成 file:///home/tomcat/upload/uploadFile/00X2TSSX20183641/%E6%B5%8B%E8%AF%95%E6%A8%A1%E6%9D%BF00002.xls，也就是中文名被转义（URL 编码）了。最后调用
```
XComponentLoader desktop = this.openOfficeConnection.getDesktop();
return desktop.loadComponentFromURL(inputUrl, "_blank", 0, toPropertyValues(loadProperties));
```
这个路径在 Linux 环境下无法找到文件，但是在 Windows 下没问题，目前还没找到原因是什么。
我尝试让文件名不转义，发现还是同样的问题：在 Windows 下没问题，在 Linux 下就不行。

三，解决问题

最后无奈之下，只好将文件名改为由数字或者字母组成，只要不是中文就可以。在 Linux 环境下测试了一下，发现问题解决了。

四，最后也贴上我转换的代码

package com.webflowApp.common;

import com.artofsolving.jodconverter.BasicDocumentFormatRegistry;
import com.artofsolving.jodconverter.DefaultDocumentFormatRegistry;
import com.artofsolving.jodconverter.DocumentConverter;
import com.artofsolving.jodconverter.DocumentFormat;
import com.artofsolving.jodconverter.openoffice.connection.OpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.connection.SocketOpenOfficeConnection;
import com.artofsolving.jodconverter.openoffice.converter.OpenOfficeDocumentConverter;
import com.artofsolving.jodconverter.openoffice.converter.StreamOpenOfficeDocumentConverter;
import com.workflow.document.SystemDocumentFormatRegistry;
import net.sf.jasperreports.util.StringBufferWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.net.ConnectException;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by Administrator on 2018/8/14.
 */
public class OpenOfficeDoc {
    private static Logger logger= LoggerFactory.getLogger(OpenOfficeDoc.class);
    /**
     * Converts a document to an HTML file through the OpenOffice service
     * listening on 127.0.0.1:8100.
     *
     * <p>The output file is created inside {@code filepath} and is named after
     * the current time in milliseconds with an {@code .html} suffix.
     *
     * @param docFile  the source document to convert
     * @param filepath the directory in which the generated HTML file is stored
     * @return the generated HTML file
     * @throws FileNotFoundException if {@code docFile} does not exist
     * @throws Exception             if connecting to OpenOffice, creating the
     *                               output file or the conversion itself fails
     */
    public static File convert(File docFile, String filepath) throws Exception {
        if (!docFile.exists()) {
            throw new FileNotFoundException("源文件不存在：" + docFile.getPath());
        }
        // Target file: <filepath>/<epoch millis>.html
        File htmlFile = new File(filepath + "/" + new Date().getTime() + ".html");

        OpenOfficeConnection connection = new SocketOpenOfficeConnection("127.0.0.1", 8100);
        connection.connect();
        try {
            if (!htmlFile.exists()) {
                htmlFile.createNewFile();
            }
            DocumentConverter converter = new OpenOfficeDocumentConverter(connection);
            converter.convert(docFile, htmlFile);
        } finally {
            // Always release the socket, even when file creation or conversion fails.
            connection.disconnect();
        }
        return htmlFile;
    }

    /**
     * Guesses the character encoding of a stream from its leading byte-order mark.
     *
     * <p>Up to three bytes are consumed and the stream is always closed before
     * this method returns, so the caller must open a fresh stream to read the
     * actual content.
     *
     * @param inputStream the stream to inspect; closed by this method
     * @return {@code "UTF-16"} for an FF FE mark, {@code "Unicode"} for FE FF,
     *         {@code "UTF-8"} for EF BB BF, otherwise {@code "gb2312"}
     * @throws Exception if reading from or closing the stream fails
     */
    public static String resolveCode(InputStream inputStream) throws Exception {
        byte[] head = new byte[3];
        try {
            // A single read() may legally return fewer bytes than requested,
            // so keep reading until the header is full or the stream ends.
            int filled = 0;
            while (filled < head.length) {
                int count = inputStream.read(head, filled, head.length - filled);
                if (count < 0) {
                    break;
                }
                filled += count;
            }
        } finally {
            inputStream.close();
        }

        String code = "gb2312";  // fallback when no byte-order mark is present (GBK would also fit)
        if (head[0] == -1 && head[1] == -2) {
            code = "UTF-16";
        } else if (head[0] == -2 && head[1] == -1) {
            code = "Unicode";
        } else if (head[0] == -17 && head[1] == -69 && head[2] == -65) {
            code = "UTF-8";
        }
        return code;
    }

    /**
     * Returns the content of a document as an HTML string.
     *
     * <p>Files whose name ends with {@code .txt} are read directly, using the
     * encoding reported by {@link #resolveCode(InputStream)}; their lines are
     * concatenated without separators. Every other file is converted with
     * {@link #convert(File, String)} and the generated HTML is parsed with
     * jsoup. The files generated by the conversion are deleted by a background
     * thread roughly 21 seconds later.
     *
     * <p>Failures are logged and not propagated.
     *
     * @param docFile  the document to convert
     * @param filepath the directory that receives the generated HTML file
     *                 (presumably also the images exported with it — confirm)
     * @return the HTML text, or an empty string if anything went wrong
     */
    public static String toHtmlString(final File docFile, final String filepath) {
        String htmlStr = "";
        try {
            // Match the extension, not any occurrence of "txt" in the name,
            // so that e.g. "txt_report.doc" still goes through the converter.
            if (docFile.getName().endsWith(".txt")) {
                htmlStr = readPlainText(docFile);
            } else {
                final File htmlFile = convert(docFile, filepath);
                try {
                    htmlStr = Jsoup.parse(htmlFile, "UTF-8").html();
                } finally {
                    // Schedule the cleanup even when parsing fails, otherwise
                    // the generated files would stay on disk forever.
                    String generatedName = htmlFile.getName();
                    scheduleCleanup(filepath,
                            generatedName.substring(0, generatedName.indexOf(".")));
                }
            }
        } catch (Exception e) {
            logger.error("文件转换异常！", e);
        }
        return htmlStr;
    }

    /**
     * Reads a text file into one string, dropping line terminators and a
     * leading byte-order mark.
     */
    private static String readPlainText(File textFile) throws Exception {
        String code = resolveCode(new FileInputStream(textFile));
        FileInputStream in = new FileInputStream(textFile);
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(in, code));
            StringBuilder text = new StringBuilder();
            String line;
            // readLine() returning null is the only reliable end-of-file signal;
            // ready() merely reports whether a read would block.
            while ((line = reader.readLine()) != null) {
                text.append(line);
            }
            // The UTF-8 decoder keeps the byte-order mark as U+FEFF; drop it.
            if (text.length() > 0 && text.charAt(0) == '\uFEFF') {
                text.deleteCharAt(0);
            }
            return text.toString();
        } finally {
            in.close();
        }
    }

    /**
     * Starts a thread that waits about 21 seconds and then deletes every file
     * in {@code dir} whose name contains {@code namePart}.
     */
    private static void scheduleCleanup(final String dir, final String namePart) {
        // The delay keeps the generated files available for a short while
        // after the HTML has been returned.
        final long delayMillis = 21000L;
        Thread cleaner = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    // Sleep instead of spinning on the clock, which would
                    // keep a CPU core busy for the whole delay.
                    Thread.sleep(delayMillis);
                } catch (InterruptedException e) {
                    // Keep the interrupt flag and clean up right away.
                    Thread.currentThread().interrupt();
                }
                File[] files = new File(dir).listFiles();
                if (files == null) {
                    // Not a directory, or it could not be listed.
                    return;
                }
                for (File file : files) {
                    if (file.getName().indexOf(namePart) != -1) {
                        if (file.delete()) {
                            logger.info("已删除文件：" + file.getName());
                        } else {
                            logger.warn("文件删除失败：" + file.getName());
                        }
                    }
                }
            }
        });
        cleaner.start();
    }

    /**
     * Strips office-specific markup from converted HTML.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>keep only the {@code <BODY ...>...</BODY>} section, if present, and
     *       turn it into a {@code <DIV>};</li>
     *   <li>prefix every {@code <IMG SRC="} value with {@code docImgPath + "/"};</li>
     *   <li>drop the attributes of {@code <P>} tags closed on the same line;</li>
     *   <li>remove font, span, xml, del, ins, meta and namespaced office tags;</li>
     *   <li>remove lang, class, style, size, face and namespaced attributes.</li>
     * </ol>
     * Apart from the tag and attribute removal lists, matching is
     * case-sensitive and expects upper-case tag names.
     *
     * @param htmlStr    HTML text carrying office-specific markup
     * @param docImgPath path placed in front of every image source
     * @return the cleaned HTML text
     */
    protected static String clearFormat(String htmlStr, String docImgPath) {
        // DOTALL lets the body span several lines.
        Matcher bodyMatcher = Pattern.compile("<BODY .*</BODY>", Pattern.DOTALL).matcher(htmlStr);
        if (bodyMatcher.find()) {
            // Keep the body content only and turn the BODY tag into a DIV.
            htmlStr = bodyMatcher.group().replaceFirst("<BODY", "<DIV")
                    .replaceAll("</BODY>", "</DIV>");
        }
        // Re-root image sources. The path is quoted because '\' and '$' are
        // special in a regex replacement and occur in real paths.
        htmlStr = htmlStr.replaceAll("<IMG SRC=\"",
                Matcher.quoteReplacement("<IMG SRC=\"" + docImgPath + "/"));
        // \b keeps tags that merely start with P, such as <PRE>, untouched.
        htmlStr = htmlStr.replaceAll("(<P\\b)([^>]*)(>.*?)(<\\/P>)", "<p$3</p>");
        // Remove unwanted tags.
        htmlStr = htmlStr.replaceAll(
                        "<[/]?(font|FONT|span|SPAN|xml|XML|del|DEL|ins|INS|meta|META|[ovwxpOVWXP]:\\w+)[^>]*?>","");
        // Remove unwanted attributes. A value is single-quoted, double-quoted
        // or an unquoted run up to the next whitespace, so the attributes that
        // follow it survive.
        Pattern attribute = Pattern.compile(
                "<([^>]*)(?:lang|LANG|class|CLASS|style|STYLE|size|SIZE|face|FACE|[ovwxpOVWXP]:\\w+)=(?:'[^']*'|\"[^\"]*\"|[^\\s>]+)([^>]*)>");
        // One pass removes a single attribute per tag, so repeat until stable.
        // Each replacement shortens the text, so the loop terminates.
        String previous;
        do {
            previous = htmlStr;
            htmlStr = attribute.matcher(htmlStr).replaceAll("<$1$2>");
        } while (!previous.equals(htmlStr));
        return htmlStr;
    }
}