POI转Word为HTML

最新推荐文章于 2024-08-10 15:43:39 发布

D丶ream

最新推荐文章于 2024-08-10 15:43:39 发布

阅读量244

点赞数 1

分类专栏：Word文档操作　文章标签：POI、POI读取Word、java读取Word、Word转为HTML

本文链接：https://blog.csdn.net/h1059141989/article/details/102567470

版权

Word文档操作专栏收录该内容

3 篇文章 0 订阅

订阅专栏

使用POI操作Word最为方便，但它对Word格式的支持也最不完善。

引入jar包

<!-- Maven dependencies for the WordUtils example below. -->
<!-- The four Apache POI modules that follow are all pinned to the same version (4.1.0). -->
<dependency>
	<groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.0</version>
</dependency>
<!-- NOTE(review): presumably supplies the org.apache.poi.xwpf classes used for .docx files; confirm. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>4.1.0</version>
</dependency>
<!-- NOTE(review): presumably supplies the org.apache.poi.hwpf classes used for .doc files; confirm. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.0</version>
</dependency>
<!-- docx office -->
<!-- ooxml-schemas is versioned separately (1.4) from the POI modules above. -->
<!-- NOTE(review): it is declared alongside poi-ooxml-schemas; verify that both are really needed. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.4</version>
</dependency>
<!-- NOTE(review): presumably supplies the fr.opensagres.poi.xwpf.converter classes imported by the code; confirm. -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>xdocreport</artifactId>
    <version>2.0.2</version>
</dependency>

代码

import java.io.*;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import fr.opensagres.poi.xwpf.converter.core.BasicURIResolver;
import fr.opensagres.poi.xwpf.converter.core.FileImageExtractor;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;

/**
 * Converts Word documents to HTML files.
 *
 * <p>Legacy {@code .doc} files are converted with POI's {@link WordToHtmlConverter};
 * {@code .docx} files are converted with the XDocReport {@link XHTMLConverter}.
 *
 * <p>The output is written next to the source document, inside a new sub-directory named
 * after the current time in milliseconds. Extracted pictures are stored in an
 * {@code image<timestamp>} directory beside the generated HTML file.
 */
public class WordUtils {

    /**
     * Demo entry point: converts a hard-coded sample document and prints the resulting
     * HTML path (an empty line when the conversion did not produce a file).
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String filepath = "C:\\Users\\xxx\\Desktop\\xxx.doc";
        String htmlFile = analysisWord(filepath);
        System.out.println(htmlFile);
    }

    /**
     * Converts the Word document at the given path to an HTML file.
     *
     * <p>The converter is chosen from the file extension, compared case-insensitively so
     * that {@code .DOC} and {@code .DOCX} are accepted as well.
     *
     * @param filepath path of the Word document; may be {@code null}
     * @return path of the generated HTML file, or an empty string when {@code filepath} is
     *         {@code null}, the file does not exist, the extension is not a Word extension,
     *         or the conversion failed
     */
    private static String analysisWord(String filepath) {
        if (filepath == null || !new File(filepath).exists()) {
            return "";
        }
        // Locale.ROOT keeps the lower-casing independent of the default locale.
        String lowerCasePath = filepath.toLowerCase(java.util.Locale.ROOT);
        if (lowerCasePath.endsWith(".doc")) {
            return docToHtml(filepath);
        }
        if (lowerCasePath.endsWith(".docx")) {
            return docxToHtml(filepath);
        }
        System.out.println("此文件不是word文件！");
        return "";
    }

    /**
     * Converts a {@code .docx} file to an HTML file.
     *
     * @param filepath path of the {@code .docx} document
     * @return path of the generated HTML file, or an empty string when the conversion failed
     */
    private static String docxToHtml(String filepath) {
        System.out.println("======word附件路径：filepath"+filepath);

        String timemill = String.valueOf(System.currentTimeMillis());
        File outputDir = createOutputDir(filepath, timemill);
        File htmlFile = new File(outputDir, timemill + ".html");
        String imageDirName = "image" + timemill;

        // try-with-resources closes the input stream, the document and the writer on every path.
        try (InputStream wordIn = new FileInputStream(filepath);
             XWPFDocument document = new XWPFDocument(wordIn);
             OutputStream htmlOut = new FileOutputStream(htmlFile);
             Writer htmlWriter = new OutputStreamWriter(htmlOut, "utf-8")) {
            XHTMLOptions options = XHTMLOptions.create();
            // Directory that receives the pictures extracted from the document.
            options.setExtractor(new FileImageExtractor(new File(outputDir, imageDirName)));
            // Prefix used for picture references inside the generated HTML.
            options.URIResolver(new BasicURIResolver(imageDirName));
            XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
            xhtmlConverter.convert(document, htmlWriter, options);
            return htmlFile.getPath();
        } catch (IOException | RuntimeException e) {
            // Unchecked parser/converter failures are reported the same way as I/O failures,
            // so a corrupt document yields "" here just as it does in docToHtml.
            System.out.println("============docx文件解析出错！============");
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Converts a legacy {@code .doc} file to an HTML file.
     *
     * @param filepath path of the {@code .doc} document
     * @return path of the generated HTML file, or an empty string when the conversion failed
     */
    private static String docToHtml(String filepath) {
        System.out.println("======word附件路径：filepath"+filepath);

        final String timemill = String.valueOf(System.currentTimeMillis());
        File outputDir = createOutputDir(filepath, timemill);
        File htmlFile = new File(outputDir, timemill + ".html");
        final String imageDirName = "image" + timemill;
        final File imageDir = new File(outputDir, imageDirName);
        if (!imageDir.exists()) {
            imageDir.mkdirs();
        }

        // try-with-resources closes the input stream and the document on every path.
        try (InputStream wordIn = new FileInputStream(filepath);
             HWPFDocument wordDocument = new HWPFDocument(wordIn)) {
            org.w3c.dom.Document document =
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
            // Save each picture and return its path relative to the HTML file.
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                @Override
                public String savePicture(byte[] content, PictureType pictureType, String name,
                        float width, float height) {
                    try (OutputStream pictureOut = new FileOutputStream(new File(imageDir, name))) {
                        pictureOut.write(content);
                    } catch (IOException | RuntimeException e) {
                        // Best effort: one unsaved picture must not abort the whole conversion.
                        e.printStackTrace();
                    }
                    // The value ends up in the HTML as a URL, so it always uses '/',
                    // never the platform file separator.
                    return imageDirName + "/" + name;
                }
            });
            wordToHtmlConverter.processDocument(wordDocument);

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            // Serialize through an explicit stream so that it is closed deterministically.
            try (OutputStream htmlOut = new FileOutputStream(htmlFile)) {
                serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()),
                        new StreamResult(htmlOut));
            }
            return htmlFile.getPath();
        } catch (Exception e) {
            System.out.println("============doc文件解析出错！============");
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Creates the output directory for one conversion: a directory named {@code timemill}
     * located next to the source document.
     *
     * @param filepath path of the source document
     * @param timemill name of the directory to create
     * @return the output directory; it may not exist if it could not be created, in which
     *         case opening the output file fails and the caller reports the error
     */
    private static File createOutputDir(String filepath, String timemill) {
        // Resolve to an absolute path first: getParent() is null for a bare file name.
        File parentDir = new File(filepath).getAbsoluteFile().getParentFile();
        File outputDir = new File(parentDir, timemill);
        if (!outputDir.exists()) {
            outputDir.mkdirs();
        }
        return outputDir;
    }

}