使用poi将word转为html

最新推荐文章于 2024-04-25 20:22:26 发布

久夏

最新推荐文章于 2024-04-25 20:22:26 发布

阅读量3k

点赞数

分类专栏： poi 文章标签： poi html word

poi 专栏收录该内容

2 篇文章 0 订阅

订阅专栏

使用poi将word转为html

需求：将上传的word文档转为html并返回页面填充到富文本编辑器中
尝试过的方案：
1.使用openoffice转换：存在图片错位的问题；
2.使用poi将word转为html：word中的图片格式多样，wmf、emf等格式的图片无法在页面上显示；
3.限定上传文件为docx，将后缀改为zip后解压，可得到所有图片且格式均为png，但html仍需通过其他方法获得，再修改img标签中的图片路径。
本文采用方案2（使用poi将word转为html），图片格式问题将在之后的文章中解决。

使用Maven导入jar包

<!-- Apache POI core library, version 3.14 (all org.apache.poi artifacts below except ooxml-schemas use this version). -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi</artifactId>
  <version>3.14</version>
</dependency>
<!-- Presumably supplies the org.apache.poi.hwpf (.doc) classes imported by PoiUtil; confirm. -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-scratchpad</artifactId>
  <version>3.14</version>
</dependency>
<!-- Presumably supplies the org.apache.poi.xwpf.usermodel (.docx) classes imported by PoiUtil; confirm. -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-ooxml</artifactId>
  <version>3.14</version>
</dependency>
<!-- Presumably supplies the org.apache.poi.xwpf.converter.* classes (XHTMLConverter, XHTMLOptions, ...) imported by PoiUtil; confirm. -->
<dependency>
  <groupId>fr.opensagres.xdocreport</groupId>
  <artifactId>xdocreport</artifactId>
  <version>1.0.6</version>
</dependency>
<!-- NOTE(review): poi-ooxml-schemas and ooxml-schemas (below) look like overlapping schema jars; confirm whether both are really needed. -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>poi-ooxml-schemas</artifactId>
  <version>3.14</version>
</dependency>
<!-- NOTE(review): versioned separately (1.3) from the 3.14 artifacts above; confirm it matches the POI version in use. -->
<dependency>
  <groupId>org.apache.poi</groupId>
  <artifactId>ooxml-schemas</artifactId>
  <version>1.3</version>
</dependency>

PoiUtil.java

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by will on 2017/6/9.
 * 使用poi将word转为html文件，并从文件中读取内容
 */
public class PoiUtil {
    // 在html中图片保存的相对路径
    private static String imagePath;

    /**
    * @param source word文件的File对象
    * @param sourceFileName word文件名
    * @param savePath 图片保存路径
    * @return 转成的html字符串
    */
    public static String getHtml(File source, String sourceFileName, String savePath) throws Exception {
        imagePath = "/upload/" + sourceFileName.substring(0, sourceFileName.lastIndexOf("."));
        String imagePathStr = savePath + File.separator + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + File.separator;
        String content;
        String imgEnd = "";
        // 判断word文档类型，使用不同方法进行转换
        if (sourceFileName.endsWith(".doc")) {
            content = docToStr(source, sourceFileName, imagePathStr);
        } else if (sourceFileName.endsWith(".docx")) {
            content = docxToStr(source, sourceFileName, imagePathStr);
            // 转换docx文件得到的图片路径
            imgEnd = "word/media/";
        } else {
            return "文件类型错误";
        }
        // 利用正则表达式过滤无用标签和属性
        content = RegexAnswerUtil.clear(content)；
        return content;
    }

    // doc转换为html
    public static String docToStr(File source, String sourceFileName, String imagePathStr) throws Exception {
        String targetFileName = imagePathStr + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + ".html";
        File target = new File(targetFileName);
        target.getParentFile().mkdirs();
        HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(source));
        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
        // 保存图片，并返回图片的相对路径
        wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {
            try (FileOutputStream out = new FileOutputStream(new File(imagePathStr + name))) {
                out.write(content);
            } catch (Exception e) {
                e.printStackTrace();
            }
            return imagePath +"/" + name;
        });
        wordToHtmlConverter.processDocument(wordDocument);
        Document htmlDocument = wordToHtmlConverter.getDocument();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(new File(targetFileName));
        TransformerFactory tf = TransformerFactory.newInstance();
        Transformer serializer = tf.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);
        String content = splitContext(targetFileName);
        // 删除生成的html文件
        File file = new File(targetFileName);
        file.delete();
        return content;
    }

    // docx转换为html
    public static String docxToStr(File source, String sourceFileName, String imagePathStr) throws Exception {
        String targetFileName = imagePathStr + sourceFileName.substring(0, sourceFileName.lastIndexOf(".")) + ".html";
        File target = new File(targetFileName);
        target.getParentFile().mkdirs();
        OutputStreamWriter outputStreamWriter = null;
        try {
            XWPFDocument document = new XWPFDocument(new FileInputStream(source));
            XHTMLOptions options = XHTMLOptions.create();
            // 存放图片的文件夹
            options.setExtractor(new FileImageExtractor(new File(imagePathStr)));
            // html中图片的路径
            options.URIResolver(new BasicURIResolver(imagePath));
            outputStreamWriter = new OutputStreamWriter(new FileOutputStream(target), "utf-8");
            XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
            xhtmlConverter.convert(document, outputStreamWriter, options);
        } finally {
            if (outputStreamWriter != null) {
                outputStreamWriter.close();
            }
        }
        String content = splitContext(targetFileName);
        // 删除生成的html文件
        File file = new File(targetFileName);
        file.delete();
        return content;
    }

    /**
    * docx文件转html会生成html编码
    * 该方法能转换大部分
    * 富文本编辑器中可以不做处理
    */
    public static String htmlEncoding(String html) {
        String regExp = "&#\\d*;";
        Matcher m = Pattern.compile(regExp).matcher(html);
        StringBuffer sb = new StringBuffer();
        if (!m.find()) {
            sb.append(html);
        } else {
            while (m.find()) {
                String s = m.group(0);
                s = s.replaceAll("(&#)|;", "");
                char c = (char) Integer.parseInt(s);
                m.appendReplacement(sb, Character.toString(c));
            }
        }
        return sb.toString();
    }

    /**
    * 读取转换得到的html文件，并过滤多余空行
    */
    public static String splitContext(String filePath) {
        File file = new File(filePath);
        BufferedReader reader = null;
        try {
            InputStreamReader isr = new InputStreamReader(new FileInputStream(file), "UTF-8");
            reader = new BufferedReader(isr);
            StringBuilder sb = new StringBuilder();
            String tempString = null;
            // 一次读入一行，直到读入null为文件结束
            while ((tempString = reader.readLine()) != null) {
                sb.append(tempString);
                if(!tempString.equals("")){
                    sb.append("\n");
                }
            }
            reader.close();
            String content = sb.toString().replaceAll("\\n+", "\n");
            return content;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e1) {
                }
            }
        }
        return "";
    }
}

RegexAnswerUtil.java

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created by will on 2017/6/9.
 * 清除无用的标签和属性
 */
/**
 * Strips generated HTML down to plain tags: keeps only the content of {@code <body>}, removes
 * all attributes from ordinary tags and reduces {@code <img>} tags to their {@code src}.
 *
 * <p>The clean-up is regex based and therefore assumes attribute values do not contain
 * {@code '>'}.
 */
public class RegexAnswerUtil {

    /** An opening tag that carries at least one attribute; group 1 is the tag name. */
    private static final Pattern TAG_WITH_ATTRIBUTES = Pattern.compile("<(\\w+)\\s[^>]+>");

    /** A double-quoted, non-empty {@code src} attribute (preceded by whitespace, so not {@code data-src}). */
    private static final Pattern IMG_SRC =
            Pattern.compile("(?<=\\s)src=\"[^\"]+\"", Pattern.CASE_INSENSITIVE);

    /**
     * Cleans an HTML string in a single pass over its tags.
     *
     * <ul>
     *   <li>Only the text between {@code <body ...>} and {@code </body>} is kept (the whole
     *       input when there is no body element).</li>
     *   <li>{@code <img ... src="x" ...>} becomes {@code <img src="x"/>}; an {@code <img>}
     *       without a quoted {@code src} is left untouched.</li>
     *   <li>Any other tag with attributes, e.g. {@code <p class="a">}, becomes {@code <p>}.</li>
     * </ul>
     *
     * @param returnString the HTML string, may be {@code null}
     * @return the cleaned HTML; {@code ""} for {@code null} input
     */
    public static String clear(String returnString) {
        if (returnString == null) {
            return "";
        }
        String body = extractBody(returnString);
        Matcher tag = TAG_WITH_ATTRIBUTES.matcher(body);
        StringBuffer cleaned = new StringBuffer(body.length());
        while (tag.find()) {
            // quoteReplacement: a src value may contain '$' or '\', which must be taken literally.
            tag.appendReplacement(cleaned, Matcher.quoteReplacement(simplifyTag(tag.group(), tag.group(1))));
        }
        tag.appendTail(cleaned);
        return cleaned.toString();
    }

    /** Returns the text between the body tags, or the whole input when no body tag is present. */
    private static String extractBody(String html) {
        int start = 0;
        int bodyOpen = html.indexOf("<body");
        if (bodyOpen != -1) {
            // indexOf returns -1 for an unterminated "<body", which makes start fall back to 0.
            start = html.indexOf('>', bodyOpen) + 1;
        }
        // Search from start so a stray "</body>" before the body cannot produce end < start.
        int bodyClose = html.indexOf("</body>", start);
        int end = bodyClose == -1 ? html.length() : bodyClose;
        return html.substring(start, end);
    }

    /** Returns the attribute-free form of one tag; the src lookup never leaves the tag itself. */
    private static String simplifyTag(String tag, String name) {
        if (!"img".equalsIgnoreCase(name)) {
            return "<" + name + ">";
        }
        Matcher src = IMG_SRC.matcher(tag);
        return src.find() ? "<img " + src.group() + "/>" : tag;
    }
}

久夏

关注

0
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
使用poi将word转为html

使用poi将word转为html 需求：将上传的word文档转为html并返回页面填充到富文本编辑器中使用方法： 1.openoffice出现问题：图片错位; 2.poi将word转为html；图片格式多样，如wmf，emf等文件格式不能在页面上显示; 3.上传文件限定为docx，更改后缀为zip，解压可得到所有图片并且格式为png，但html需使用其他方法获得再
复制链接

扫一扫