java将word转为html，图片替换为base64代码

最新推荐文章于 2024-03-18 15:40:43 发布

猿猴一号(lxd)

最新推荐文章于 2024-03-18 15:40:43 发布

阅读量1.1k

点赞数

分类专栏： JAVA 文章标签： java html 前端 java-ee spring boot

本文链接：https://blog.csdn.net/weixin_42123075/article/details/125994994

版权

JAVA 专栏收录该内容

33 篇文章 3 订阅

订阅专栏

使用 Spire.Doc 导入 docx 格式的 Word 文档，并转换为 html 代码。
转换后会生成3个文件：xxx.html,xxx.css,以及存放word中图片的文件夹，需求是要求将样式以及图片一起放到xxx.html代码中，图片采用base64替换标签的src路径。
附官网链接: Spire.Doc for Java

maven依赖

<!-- https://mvnrepository.com/artifact/e-iceblue/spire.office -->
<dependency>
    <groupId>e-iceblue</groupId>
    <artifactId>spire.office.free</artifactId>
    <version>5.3.1</version>
</dependency>

方法

/**
 * Converts a Word document to a single self-contained HTML string.
 *
 * <p>The document is saved as HTML under {@code docxFilePath} using a random
 * UUID as file-name prefix. The generated stylesheet is then inlined into a
 * {@code <style>} element and every image {@code src} is replaced by a base64
 * PNG data URI. The output directory is removed afterwards, whether or not the
 * conversion succeeded.
 *
 * <p>NOTE(review): cleanup deletes the whole {@code docxFilePath} directory, not
 * just the files of this call, so concurrent conversions sharing that directory
 * can remove each other's output -- confirm this is acceptable for the caller.
 *
 * @param filePath path of the Word file to convert
 * @param response not used by this method; kept for interface compatibility
 * @return the self-contained HTML, or an empty string if any step failed
 */
public String importConclusion(String filePath, HttpServletResponse response){
    try {
        try {
            File importFile = new File(filePath);
            String uuid = UUID.randomUUID().toString();
            List<BufferedImage> images;

            Document doc = new Document();
            try {
                doc.setKeepSameFormat(true);
                doc.loadFromFile(importFile.getPath());
                images = collectPictures(doc);

                // Create the output directory when it is missing (the check was
                // previously inverted and only ran when it already existed).
                File dirFile = new File(docxFilePath);
                if (!dirFile.exists()) {
                    dirFile.mkdirs();
                }
                doc.saveToFile(docxFilePath + uuid + "docx.html", FileFormat.Html);
            } finally {
                // Release the document even when loading or saving fails.
                doc.dispose();
            }

            // Read the generated stylesheet and splice it into the HTML in place
            // of the <link> element that references it.
            String cssStyle = readCss(docxFilePath + uuid + "docx_styles.css");
            cssStyle = "<style>" + cssStyle + "</style>";
            String htmlStr = readCss(docxFilePath + uuid + "docx.html");
            htmlStr = htmlStr.replace("<link href=\""+uuid+"docx_styles.css\" type=\"text/css\" rel=\"stylesheet\"/>",cssStyle);

            return inlineImages(htmlStr, images);
        } finally {
            // Remove the generated files on both the success and failure paths.
            // Kept inside the outer try so a cleanup failure still yields "".
            deleteAllFiles(docxFilePath);
        }
    } catch (Exception e) {
        // Best-effort contract: callers receive "" on any failure.
        e.printStackTrace();
    }

    return "";
}

/** Collects the pictures that are direct children of each section's paragraphs, in document order. */
private List<BufferedImage> collectPictures(Document doc) {
    List<BufferedImage> images = new ArrayList<>();
    for (int i = 0; i < doc.getSections().getCount(); i++) {
        int paragraphsCount = doc.getSections().get(i).getParagraphs().getCount();
        for (int j = 0; j < paragraphsCount; j++) {
            Paragraph paragraph = doc.getSections().get(i).getParagraphs().get(j);
            for (int z = 0; z < paragraph.getChildObjects().getCount(); z++) {
                Object obj = paragraph.getChildObjects().get(z);
                if (obj instanceof DocPicture) {
                    images.add(((DocPicture) obj).getImage());
                }
            }
        }
    }
    return images;
}

/** Replaces each img src in the HTML with a base64 PNG data URI of the picture at the same position. */
private String inlineImages(String html, List<BufferedImage> images) throws IOException {
    // Compile both patterns once instead of once per matched tag.
    Pattern imgPattern = Pattern.compile("<(img|IMG)(.*?)(/>|></img>|>)");
    Pattern srcPattern = Pattern.compile("(src|SRC)=(\"|\')(.*?)(\"|\')");

    List<String> imgList = new ArrayList<>();
    Matcher imgMatcher = imgPattern.matcher(html);
    while (imgMatcher.find()) {
        Matcher srcMatcher = srcPattern.matcher(imgMatcher.group(2));
        if (srcMatcher.find()) {
            imgList.add(srcMatcher.group(3));
        }
    }

    // Pictures and tags are paired by position, so substitute only when the
    // two lists line up one-to-one; otherwise leave the HTML untouched.
    if (images.isEmpty() || images.size() != imgList.size()) {
        return html;
    }

    String result = html;
    for (int i = 0; i < images.size(); i++) {
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        ImageIO.write(images.get(i), "png", outputStream);
        // A data URI must not contain whitespace between ";base64" and ",".
        String base64 = "data:image/png;base64," + Base64.getEncoder().encodeToString(outputStream.toByteArray());
        result = result.replace(imgList.get(i), base64);
    }
    return result;
}

读取文件内容

/**
     * Reads the entire content of a file and returns it as text.
     *
     * <p>The bytes are decoded as UTF-8 rather than with the platform default
     * charset, so the result does not vary between machines.
     * NOTE(review): assumes the generated HTML/CSS files are UTF-8 encoded --
     * confirm against the converter's output.
     *
     * @param filePath path of the file to read
     * @return the file content; an empty string for an empty file
     * @throws IOException if the file cannot be opened or read
     */
    public String readCss(String filePath) throws IOException {
        File file = new File(filePath);
        // try-with-resources closes the stream on every path and avoids the
        // NullPointerException that close() raised when the open itself failed.
        try (FileInputStream in = new FileInputStream(file)) {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int read;
            // Loop until EOF: a single read() sized by available() is not
            // guaranteed to return the whole file.
            while ((read = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }
            return buffer.toString("UTF-8");
        }
    }

删除文件夹及文件夹下的所有内容

/**
     * Recursively deletes a directory together with everything inside it.
     *
     * @param dir path of the directory to delete
     * @return {@code true} only if every child and the directory itself were
     *         deleted; {@code false} if {@code dir} does not exist, is not a
     *         directory, cannot be listed, or any deletion failed
     */
    public static boolean deleteAllFiles(String dir){
        File dirFile = new File(dir);
        if(!dirFile.exists() || !dirFile.isDirectory()){
            return false;
        }
        File[] files = dirFile.listFiles();
        if (files == null) {
            // listFiles() returns null when the directory cannot be read.
            return false;
        }
        boolean flag = true;
        for (File child : files) {
            // Accumulate with &= so one failure is not hidden by a later success.
            if (child.isFile()) {
                flag &= child.delete();
            } else if (child.isDirectory()) {
                flag &= deleteAllFiles(child.getAbsolutePath());
            }
        }
        // The directory's own removal is part of the overall result.
        flag &= dirFile.delete();
        return flag;
    }

猿猴一号(lxd)

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
java将word转为html，图片替换为base64代码

使用 Spire.Doc 导入 docx 格式的 Word 文档，并转换为 html 代码。转换后会生成3个文件xxx.html,xxx.css,以及存放word中图片的文件夹，需求是要求将样式以及图片一起放到xxx.html代码中，图片采用base64替换标签的src路径。...
复制链接

扫一扫

专栏目录