word转html文本，完美保留图片以及图片名称自定义

最新推荐文章于 2023-09-02 21:10:39 发布
·小怪兽·
最新推荐文章于 2023-09-02 21:10:39 发布
阅读量672
点赞数 2
分类专栏：文件file 文章标签： java
本文链接：https://blog.csdn.net/qq_40689241/article/details/120890780
版权
文件file 专栏收录该内容
1 篇文章
订阅专栏
word转html文本，完美保留图片以及图片名称自定义

package com.zjyang.manage.util;

import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.List;
import java.util.UUID;

/**
 * @author yangzhangjian
 * @description 读取word、PDF文档内容
 * @date 2021/10/13 17:15
 */
public class FileUtil implements ApplicationContextAware {

    /**
     * word文件读取
     * @param inputStream 文件输入流
     * @param type 文件类型（doc or docx）
     * @param imageUrl word里面的图片访问前缀
     * @param imageFilePath 图片保存地址
     * @return
     * @throws IOException
     * @throws ParserConfigurationException
     * @throws TransformerException
     */
    public static String readWord(InputStream inputStream, String type, String imageUrl, String imageFilePath) throws IOException, ParserConfigurationException, TransformerException {
        if ("doc".equals(type)) {
            HWPFDocument wordDocument = new HWPFDocument(inputStream);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // 设置图片链接
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
                                          float heightInches) {
                    String uuid = (LocalDate.now() + "_" + UUID.randomUUID()).replaceAll("-", "");
                    String filename = uuid + suggestedName.substring(suggestedName.lastIndexOf("."));
                    File file = new File(imageFilePath + filename);
                    try (FileOutputStream os = new FileOutputStream(file)) {
                    	os.write(content);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    return imageUrl + filename;
                }
            });

            // 解析word文档
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();
            // 使用字符数组流获取解析的内容
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DOMSource domSource = new DOMSource(htmlDocument);
            StreamResult streamResult = new StreamResult(baos);

            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, streamResult);
            String content = baos.toString();
            baos.close();
            return content;
        } else if ("docx".equals(type)) {
            XWPFDocument document = new XWPFDocument(inputStream);
            List<XWPFPictureData> list = document.getAllPictures();
            // 使用字符数组流获取解析的内容
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(document, baos, null);
            String content = baos.toString("utf-8");
            // 图片保留
            content = setImg(content, list, imageFilePath, imageUrl);
            baos.close();
            return content;
        }
        return "";
    }

    /**
     * 设置图片地址
     *
     * @param html
     * @param list
     * @param imageFilePath
     * @param imageUrl
     * @return
     */
    private static String setImg(String html, List<XWPFPictureData> list, String imageFilePath, String imageUrl) {
        User user = RedisStaticContext.getUser();
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        Elements elements = doc.getElementsByTag("img");
        if (!elements.isEmpty() && list != null) {
            for (Element element : elements) {
                String src = element.attr("src");
                for (XWPFPictureData data : list) {
                    if (src.contains(data.getFileName())) {
                        // 生成图片
                        String uuid = (LocalDate.now() + "_" + UUID.randomUUID()).replaceAll("-", "");
                        String filename = uuid + src.substring(src.lastIndexOf("."));
                        bytesToFile(data.getData(), imageFilePath + filename);
                        String url = imageUrl + filename;
                        element.attr("src", url);
                        break;
                    }
                }
            }
        }

        return doc.toString();
    }

    /**
     * byte[]转文件
     *
     * @param bytes
     * @param fileName 文件需要存储的地址+文件名（绝对地址）
     */
    public static void bytesToFile(byte[] bytes, String fileName) {
        File file = new File(fileName);
        try (FileOutputStream fos = new FileOutputStream(file);
             BufferedOutputStream bos = new BufferedOutputStream(fos);) {
            bos.write(bytes);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}