word转html文本,完美保留图片以及图片名称自定义

word转html文本,完美保留图片以及图片名称自定义

package com.zjyang.manage.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.math.BigDecimal;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.List;
import java.util.UUID;

/**
 * @author yangzhangjian
 * @description 读取word、PDF文档内容
 * @date 2021/10/13 17:15
 */
public class FileUtil implements ApplicationContextAware {

    /**
     * word文件读取
     * @param inputStream 文件输入流
     * @param type 文件类型(doc or docx)
     * @param imageUrl word里面的图片访问前缀
     * @param imageFilePath 图片保存地址
     * @return
     * @throws IOException
     * @throws ParserConfigurationException
     * @throws TransformerException
     */
    public static String readWord(InputStream inputStream, String type, String imageUrl, String imageFilePath) throws IOException, ParserConfigurationException, TransformerException {
        if ("doc".equals(type)) {
            HWPFDocument wordDocument = new HWPFDocument(inputStream);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // 设置图片链接
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches,
                                          float heightInches) {
                    String uuid = (LocalDate.now() + "_" + UUID.randomUUID()).replaceAll("-", "");
                    String filename = uuid + suggestedName.substring(suggestedName.lastIndexOf("."));
                    File file = new File(imageFilePath + filename);
                    try (FileOutputStream os = new FileOutputStream(file)) {
                    	os.write(content);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    return imageUrl + filename;
                }
            });

            // 解析word文档
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();
            // 使用字符数组流获取解析的内容
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            DOMSource domSource = new DOMSource(htmlDocument);
            StreamResult streamResult = new StreamResult(baos);

            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, streamResult);
            String content = baos.toString();
            baos.close();
            return content;
        } else if ("docx".equals(type)) {
            XWPFDocument document = new XWPFDocument(inputStream);
            List<XWPFPictureData> list = document.getAllPictures();
            // 使用字符数组流获取解析的内容
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(document, baos, null);
            String content = baos.toString("utf-8");
            // 图片保留
            content = setImg(content, list, imageFilePath, imageUrl);
            baos.close();
            return content;
        }
        return "";
    }

    /**
     * 设置图片地址
     *
     * @param html
     * @param list
     * @param imageFilePath
     * @param imageUrl
     * @return
     */
    private static String setImg(String html, List<XWPFPictureData> list, String imageFilePath, String imageUrl) {
        User user = RedisStaticContext.getUser();
        org.jsoup.nodes.Document doc = Jsoup.parse(html);
        Elements elements = doc.getElementsByTag("img");
        if (!elements.isEmpty() && list != null) {
            for (Element element : elements) {
                String src = element.attr("src");
                for (XWPFPictureData data : list) {
                    if (src.contains(data.getFileName())) {
                        // 生成图片
                        String uuid = (LocalDate.now() + "_" + UUID.randomUUID()).replaceAll("-", "");
                        String filename = uuid + src.substring(src.lastIndexOf("."));
                        bytesToFile(data.getData(), imageFilePath + filename);
                        String url = imageUrl + filename;
                        element.attr("src", url);
                        break;
                    }
                }
            }
        }

        return doc.toString();
    }

    /**
     * byte[]转文件
     *
     * @param bytes
     * @param fileName 文件需要存储的地址+文件名(绝对地址)
     */
    public static void bytesToFile(byte[] bytes, String fileName) {
        File file = new File(fileName);
        try (FileOutputStream fos = new FileOutputStream(file);
             BufferedOutputStream bos = new BufferedOutputStream(fos);) {
            bos.write(bytes);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
Java中可以通过Apache POI库来操作Word文档,通过读取Word文档中的内容,可以将其转换为富文本格式并携带图片。 具体步骤如下: 1. 使用Apache POI读取Word文档,获取文档中的内容。 2. 使用Java的Rich Text Format (RTF)类库将Word文档内容转换为RTF格式。 3. 在RTF文本中插入图片,可以使用图片的Base64编码将图片插入到文本中。 4. 将得到的RTF文本保存为文件或者直接将其返回给调用者。 下面是一个简单的示例代码,用于将Word文档转换为RTF格式并插入图片: ```java import java.io.*; import org.apache.poi.xwpf.usermodel.*; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import javax.imageio.stream.ImageInputStream; import java.io.ByteArrayOutputStream; public class WordToRTFConverter { public static void main(String[] args) throws Exception { // 读取Word文档内容 XWPFDocument docx = new XWPFDocument(new FileInputStream("example.docx")); XWPFWordExtractor ex = new XWPFWordExtractor(docx); String text = ex.getText(); // 将Word文档内容转换为RTF格式 RTFEditorKit rtf = new RTFEditorKit(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); rtf.write(baos, docx, 0, docx.getNumberOfParagraphs()); // 在RTF文本中插入图片 BufferedImage image = ImageIO.read(new File("example.png")); ByteArrayOutputStream imageOutput = new ByteArrayOutputStream(); ImageIO.write(image, "png", imageOutput); String imageData = new String(Base64.getEncoder().encode(imageOutput.toByteArray())); String rtfText = new String(baos.toByteArray(), "UTF-8"); rtfText = rtfText.replace("}", "{\\pict\\pngblip\\picw" + image.getWidth() + "\\pich" + image.getHeight() + "\\picwgoal" + (image.getWidth() * 15) + "\\pichgoal" + (image.getHeight() * 15) + "\n" + imageData + "}"); // 保存为RTF文件 FileOutputStream out = new FileOutputStream("example.rtf"); out.write(rtfText.getBytes()); out.close(); } } ``` 请注意,上述示例代码只是一个简单的示例,实际应用中可能会有更多的细节需要处理。例如,需要处理Word文档中的表格、段落样式等内容,以确保转换结果符合预期。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值