POI处理Word转HTML，读取文档内容上传图片

最新推荐文章于 2023-03-22 17:14:04 发布

格林希尔

最新推荐文章于 2023-03-22 17:14:04 发布

阅读量1.2k

点赞数

分类专栏： Java实践

本文链接：https://blog.csdn.net/u010349629/article/details/107965270

版权

Java实践专栏收录该内容

75 篇文章 6 订阅

订阅专栏

POI处理Word转HTML，读取文档内容上传图片

引

需要给官网做一个文档管理的后台服务，其中需要一个功能：上传Word文档，转换成HTML格式后返回给前端。下面和大家一起来看看后端部分的逻辑实现。

实现逻辑

需要用到的引用：

    compile('org.apache.poi:poi:4.1.0')
    compile('org.apache.poi:poi-ooxml:4.1.0')
    compile('org.apache.poi:poi-ooxml-schemas:4.1.0')
    compile('org.apache.poi:poi-scratchpad:4.1.0')
    compile('fr.opensagres.xdocreport:xdocreport:2.0.2')
    compile('org.apache.poi:ooxml-schemas:1.4')

doc转html实现

     /**
     * Converts an uploaded legacy Word ({@code .doc}) file to an HTML string.
     *
     * <p>Every embedded picture is passed to {@code uploadImages}; the URL it returns is
     * handed back to the converter as the picture location. The resulting HTML is
     * serialized to a temporary file, read back through {@link #splitContext(String)}
     * (which joins the lines and replaces double quotes with single quotes), and the
     * temporary file is deleted before returning.
     *
     * @param file the uploaded {@code .doc} file
     * @return the filtered HTML markup of the document
     * @throws Exception if the document cannot be read, converted or serialized
     */
    public String uploadDocFile(MultipartFile file) throws Exception {
        // One uniquely named temp file per call: a shared, fixed file name would let
        // concurrent requests overwrite or delete each other's output.
        File target = File.createTempFile("word2html-", ".html");
        try (HWPFDocument wordDocument = new HWPFDocument(file.getInputStream())) {
            Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);

            // The pictures manager must be registered BEFORE processDocument(); pictures
            // are handled while the document is being processed.
            wordToHtmlConverter.setPicturesManager((imageStream, pictureType, name, width, height) -> {
                String imageUrl = null;
                try {
                    imageUrl = uploadImages(imageStream);
                } catch (Exception e) {
                    // Best effort: a failed upload does not abort the conversion,
                    // the callback simply reports no URL for this picture.
                    e.printStackTrace();
                }
                return imageUrl;
            });

            // Convert the Word document into an HTML DOM.
            wordToHtmlConverter.processDocument(wordDocument);

            // Serialize the DOM as UTF-8 HTML into the temp file.
            DOMSource domSource = new DOMSource(wordToHtmlConverter.getDocument());
            StreamResult streamResult = new StreamResult(target);
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, streamResult);

            // Read the file back and normalise its content.
            return splitContext(target.getPath());
        } finally {
            // Remove the temp file on success and on failure alike.
            target.delete();
        }
    }

    /**
     * Reads an HTML file as UTF-8 and returns its content as a single line.
     *
     * <p>Lines are concatenated without any separator and every double quote is
     * replaced by a single quote.
     *
     * @param filePath path of the HTML file to read
     * @return the filtered content, or an empty string if the file cannot be read
     */
    public static String splitContext(String filePath) {
        File htmlFile = new File(filePath);
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(htmlFile), "UTF-8"))) {
            StringBuilder joined = new StringBuilder();
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                joined.append(line);
            }
            // Replace the double quotes in the markup with single quotes.
            return joined.toString().replaceAll("\"", "\'");
        } catch (IOException e) {
            e.printStackTrace();
            return "";
        }
    }

docx转html实现

    /**
     * Converts an uploaded Word ({@code .docx}) file to an HTML string.
     *
     * <p>Pictures are first extracted into a per-call directory below
     * {@code java.io.tmpdir}. Each extracted picture is then uploaded through
     * {@code uploadImages}, and its local path inside the HTML is replaced by the
     * returned URL. The extraction directory is removed before returning, also when
     * the conversion or an upload fails.
     *
     * @param file the uploaded {@code .docx} file
     * @return the HTML markup with picture paths replaced by uploaded URLs
     * @throws Exception if the document cannot be read or converted, or an upload fails
     */
    public String uploadDocXFile(MultipartFile file) throws Exception {
        // Per-call extraction directory; a random name avoids two requests arriving
        // in the same millisecond sharing (and deleting) one directory.
        String path = System.getProperty("java.io.tmpdir");
        String firstImagePathStr = path + "/" + java.util.UUID.randomUUID();
        File firstImagePath = new File(firstImagePathStr);
        try (XWPFDocument docxDocument = new XWPFDocument(file.getInputStream())) {
            XHTMLOptions options = XHTMLOptions.create();
            // Extract pictures to the directory and reference them by that same path.
            options.setExtractor(new FileImageExtractor(firstImagePath));
            options.URIResolver(new BasicURIResolver(firstImagePathStr));

            // Convert to HTML.
            ByteArrayOutputStream htmlStream = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(docxDocument, htmlStream, options);
            // NOTE(review): decodes with the platform default charset, as before;
            // confirm which charset the converter writes before changing this.
            String htmlStr = htmlStream.toString();

            // Upload every extracted picture and swap its local path for the URL.
            String imageDirStr = firstImagePathStr + "/word/media";
            String[] imageList = new File(imageDirStr).list();
            if (imageList != null) {
                for (String imageName : imageList) {
                    String oneImagePathStr = imageDirStr + "/" + imageName;
                    MultipartFile multipartFile = getMulFileByPath(oneImagePathStr);
                    String imageUrl = uploadImages(multipartFile);
                    htmlStr = htmlStr.replace(oneImagePathStr, imageUrl);
                }
            }
            return htmlStr;
        } finally {
            // Remove the extracted pictures.
            FileUtils.deleteDirectory(firstImagePath);
        }
    }

    /**
     * Wraps the file at the given path in a {@code MultipartFile}.
     *
     * @param picPath path of the file to wrap
     * @return a {@code CommonsMultipartFile} around the file item created from {@code picPath}
     */
    private static MultipartFile getMulFileByPath(String picPath) {
        return new CommonsMultipartFile(createFileItem(picPath));
    }
    /**
     * Builds a {@code FileItem} named {@code "MyFileName" + extension} and copies the
     * content of the given file into it.
     *
     * @param filePath path of the file whose content is copied into the item
     * @return the populated file item
     * @throws java.io.UncheckedIOException if the file cannot be read or the item cannot
     *         be written; returning a partially filled item would otherwise go unnoticed
     */
    private static FileItem createFileItem(String filePath) {
        File source = new File(filePath);

        // Take the extension from the file name only, so a dot in a directory name is
        // not mistaken for one, and a name without a dot yields no extension instead of
        // an out-of-range substring.
        String fileName = source.getName();
        int dot = fileName.lastIndexOf('.');
        String extFile = dot >= 0 ? fileName.substring(dot) : "";

        FileItemFactory factory = new DiskFileItemFactory(16, null);
        // NOTE(review): field name, "text/plain" content type and isFormField=true are
        // kept from the original; confirm they suit the upload target.
        FileItem item = factory.createItem("textField", "text/plain", true,
                "MyFileName" + extFile);

        byte[] buffer = new byte[8192];
        // try-with-resources closes both streams even when the copy fails midway.
        try (FileInputStream fis = new FileInputStream(source);
             OutputStream os = item.getOutputStream()) {
            int bytesRead;
            while ((bytesRead = fis.read(buffer, 0, buffer.length)) != -1) {
                os.write(buffer, 0, bytesRead);
            }
        } catch (IOException e) {
            throw new java.io.UncheckedIOException(
                    "Failed to copy " + filePath + " into a file item", e);
        }
        return item;
    }

小结

实现主要依赖POI包：将上传的文件流转换为HTML文档，在转换过程中把文档内嵌的图片上传到服务器，并用上传后返回的链接替换HTML中的图片地址。

格林希尔

关注

0
点赞
踩
7

收藏

觉得还不错? 一键收藏
打赏
2
评论
POI处理Word转HTML，读取文档内容上传图片

POI处理Word转HTML，读取文档内容上传图片POI处理Word转HTML，读取文档内容上传图片新的改变POI处理Word转HTML，读取文档内容上传图片需要给官网做一个文档管理的后台服务，需要一个上传Word文档转换成HTML格式返回给前端的功能，下面和大家一起来看看，后端部分的逻辑实现。新的改变...
复制链接

扫一扫