POI word转html

最新推荐文章于 2023-08-03 14:05:48 发布

M2L

最新推荐文章于 2023-08-03 14:05:48 发布

阅读量163

点赞数

分类专栏： POI opensagres

原文链接：https://www.cnblogs.com/always-online/p/4800131.html#commentform

版权

opensagres 同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

POI

0 篇文章 0 订阅

订阅专栏

转载文章来源：https://www.cnblogs.com/always-online/p/4800131.html#commentform

Maven 依赖：这里需要注意，各依赖之间应保持版本一致，否则会导致包冲突。我刚入门时曾因此困扰了很久。

<dependencies>
        <!-- xdocreport artifacts (start). The conversion code uses the
             org.apache.poi.xwpf.converter.* classes for .docx input; presumably they are
             provided by these artifacts (verify against the jars). Keep both versions
             identical: the accompanying note warns that mismatched versions cause
             package conflicts. -->
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.xdocreport.document</artifactId>
            <version>1.0.5</version>
        </dependency>
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>
            <version>1.0.5</version>
        </dependency>
        <!-- xdocreport artifacts (end) -->

        <!-- Apache POI artifacts (start). The conversion code imports org.apache.poi.hwpf.*
             for .doc input and org.apache.poi.xwpf.usermodel.XWPFDocument for .docx input;
             presumably those packages ship in these artifacts (verify against the jars).
             Keep all three versions identical. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.13</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.13</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.13</version>
        </dependency>
        <!-- Apache POI artifacts (end) -->
   </dependencies>

转换代码

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.core.FileURIResolver;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.*;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.util.List;



import static java.lang.Boolean.FALSE;
import static java.lang.Boolean.TRUE;

public class DocTypeConvertImpl {

    public DocTypeConvertImpl() {

    }

    /*
     * @param filePath word文档的路径
     *
     */
    public void docToHtml(String filePath) throws IOException, TransformerException, ParserConfigurationException {

        //word文档类型，docx类型为TURE，doc类型为FALSE
        boolean docType = FALSE;

        //生成html文件存放路径
        final String desPath = "D:\\";
        //html图片存放路径
        final String imagesPath = "D:\\";

        //获取文件名称
        String fileName = filePath.substring(filePath.lastIndexOf("\\") + 1);
        //获取文件前缀名
        String name = fileName.substring(0, fileName.lastIndexOf("."));

        //判断文件类型
        if (fileName.endsWith(".docx") || fileName.endsWith(".DOCX")) docType = TRUE;

        //开始转换
        if (docType) {//当word文档为docx类型时
            File f = new File(filePath);
            if (!f.exists()) {
                System.out.println("Sorry File does not Exists!");
            } else {
                if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {

                    // 1) 加载word文档生成 XWPFDocument对象
                    InputStream in = new FileInputStream(f);
                    XWPFDocument document = new XWPFDocument(in);

                    // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)
                    File imageFolderFile = new File(imagesPath);
                    XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
                    options.setExtractor(new FileImageExtractor(imageFolderFile));
                    options.setIgnoreStylesIfUnused(false);
                    options.setFragment(true);

                    // 3) 将 XWPFDocument转换成XHTML
                    OutputStream out = new FileOutputStream(new File(desPath + name + ".html"));
                    XHTMLConverter.getInstance().convert(document, out, options);
                    out.close();
                }
            }

        } else if (!docType) { //当word文档为doc类型时
            InputStream input = new FileInputStream(new File(filePath));
            HWPFDocument wordDocument = new HWPFDocument(input);
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            //设置图片存放的位置
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {
                    File imgPath = new File(imagesPath);
                    if (!imgPath.exists()) {//图片目录不存在则创建
                        imgPath.mkdirs();
                    }
                    File file = new File(imagesPath + suggestedName);
                    try {
                        OutputStream os = new FileOutputStream(file);
                        os.write(content);
                        os.close();
                    } catch (FileNotFoundException e) {
                        e.printStackTrace();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                    return imagesPath + suggestedName;
                }
            });

            //解析word文档
            wordToHtmlConverter.processDocument(wordDocument);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            File htmlFile = new File(desPath + name+".html");
            OutputStream outStream = new FileOutputStream(htmlFile);

            DOMSource domSource = new DOMSource(htmlDocument);
            StreamResult streamResult = new StreamResult(outStream);

            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");

            serializer.transform(domSource, streamResult);

            outStream.close();


        }


    }
}

M2L

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
POI word转html

转载文章来源：https://www.cnblogs.com/always-online/p/4800131.html#commentform 。Maven 依赖：这里要注意的是依赖之间应保持版本一致，否则会导致包冲突。小白的我因此困扰挺久。<dependencies> <dependency> ...
复制链接

扫一扫