java在线预览doc

最新推荐文章于 2024-04-16 11:40:37 发布

qq_36954691

最新推荐文章于 2024-04-16 11:40:37 发布

阅读量101

点赞数

文章标签： java

本文链接：https://blog.csdn.net/qq_36954691/article/details/132410426

版权

今天遇到了一个很坑的问题：从 MinIO 下载下来的 doc 文档无法用 Java 打开。尝试了 Apache POI、Aspose.Words 等工具，无论是转换成 HTML 还是转换成 PDF 文件，都一直报错“文档已损坏”；但是用 Office 打开之后随便修改一点再保存，就没有问题了。

最后实在没有办法，取了个巧，把下载下来的文档存到一个临时txt文档中，然后读取文档内容，去除多余字符，再进行截取，保留一段HTML代码传到页面显示。

// Fragment of a larger method: downloads the document at downloadUrl, keeps the text found
// between its <BODY> tags and stores it in the model under the "content" attribute.
// downloadUrl and model are supplied by the enclosing scope (not shown here).
try {
    // Create an HTTP client
    HttpClient client = HttpClientBuilder.create().build();
    // Send the GET request
    HttpGet request = new HttpGet(downloadUrl);
    HttpResponse response = client.execute(request);
    // Pull the entity out of the response
    HttpEntity entity = response.getEntity();
    if (entity != null) {
        // Read the whole entity into memory
        byte[] fileData = EntityUtils.toByteArray(entity);
        // Create the temporary file the payload is staged in
        File tempFile = File.createTempFile("temp-", ".txt");
        String cleanedContent;
        try {
            // Write the downloaded bytes to the temporary file
            try (FileOutputStream fos = new FileOutputStream(tempFile)) {
                fos.write(fileData);
            }

            // Read the file back as text
            String txtContent = loadTextFromFile(tempFile.getAbsolutePath());
            // Strip everything that is not part of the document body
            cleanedContent = removeInvalidContent(txtContent);
        } finally {
            // Always remove the temporary file, even when writing/reading/cleaning failed;
            // if it cannot be removed right now, ask the JVM to remove it on exit.
            if (!tempFile.delete()) {
                tempFile.deleteOnExit();
            }
        }
        model.addAttribute("content", cleanedContent);
    }
} catch (Exception e) {
    // Best effort: on any failure the "content" attribute is simply not set.
    // NOTE(review): consider routing this through the application's logger instead of stderr.
    e.printStackTrace();
}

private static String loadTextFromFile(String filePath) throws IOException {
    FileInputStream fis = new FileInputStream(filePath);
    //这里的编码看情况
    InputStreamReader isr = new InputStreamReader(fis, "GBK");
    BufferedReader reader = new BufferedReader(isr);
    StringBuilder content = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
        // 处理每一行的文本内容
        content.append(line);
    }
    reader.close();
    isr.close();
    fis.close();
    return content.toString();
}

/**
 * Returns the text located between the first {@code <BODY>} tag and the first
 * {@code </BODY>} tag that follows it.
 *
 * <p>Tag matching is case-sensitive. If either tag is missing (or no closing tag follows the
 * opening tag) the text is returned as-is after the special-character pass.
 *
 * @param text raw text of the downloaded document; must not be {@code null}
 * @return the body fragment, or the whole text when no complete BODY section exists
 */
private static String removeInvalidContent(String text) {
    // Special-character pass.
    // NOTE(review): the pattern is anchored with ^ and $, so it only fires when the ENTIRE
    // text consists of characters outside the whitelist (yielding ""); it does not strip
    // individual characters from mixed content. Left unchanged — confirm the intent.
    text = text.replaceAll("^[^\\w\\s.,!?【】“”‘’'\"\\-*/+\\\\^()\\[\\]{}|$%<>@#&=:;~`\\u4e00-\\u9fa5]+$", "");

    final String openTag = "<BODY>";
    final String closeTag = "</BODY>";

    // Test the raw indexOf result: adding the tag length first would hide the -1 sentinel.
    int openIndex = text.indexOf(openTag);
    if (openIndex == -1) {
        return text;
    }

    int startIndex = openIndex + openTag.length();
    // Search only after the opening tag so that endIndex can never precede startIndex.
    int endIndex = text.indexOf(closeTag, startIndex);
    if (endIndex == -1) {
        return text;
    }

    return text.substring(startIndex, endIndex);
}