word转html

最新推荐文章于 2024-04-16 20:04:03 发布

派bigStart

最新推荐文章于 2024-04-16 20:04:03 发布

阅读量866

点赞数

分类专栏： java

本文链接：https://blog.csdn.net/yy476864889/article/details/92657943

版权

java 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

注意导入的 jar 包：如果缺少某些包，或者导入的版本不对，运行时会报异常。

<!-- Maven build file for the Word-to-HTML samples that follow. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.html</groupId>
  <artifactId>html</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  
   <dependencies>
<!-- Apache POI artifacts; all org.apache.poi:poi* entries are pinned to the same version (3.14). -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>

<!-- NOTE(review): presumably the artifact that supplies org.apache.poi.hwpf.* used by the .doc converter; confirm. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>

<!-- NOTE(review): presumably the artifact that supplies org.apache.poi.xwpf.usermodel.* used by the .docx converter; confirm. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>

<!-- NOTE(review): presumably the source of the org.apache.poi.xwpf.converter.* classes imported by the .docx converter; confirm. -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>xdocreport</artifactId>
    <version>1.0.6</version>
</dependency>

<!-- NOTE(review): both poi-ooxml-schemas and ooxml-schemas are declared; verify that both are really required. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.14</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.3</version>
</dependency>

<!-- jsoup is used by the readfile helper to load the generated HTML file. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
   </dependencies>
</project>

doc转html

package com.html;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;

/**
 * Converts a legacy binary Word document ({@code .doc}) into an HTML file with Apache POI's
 * {@link WordToHtmlConverter}, writing embedded pictures to a separate image directory.
 */
public class DocToHtml {

	/** Base directory used by the no-argument {@link #docToHtml()} entry point. */
	private static final String DEFAULT_BASE_DIR = "C:/mb";

	/**
	 * Converts {@code <base>/static/test.doc} into {@code <base>/static/test2.html}, storing
	 * pictures under {@code <base>/static/image}, where {@code <base>} is {@code C:/mb}.
	 *
	 * @return the path of the generated HTML file
	 * @throws Exception if the source cannot be read or the HTML cannot be written
	 */
	public static String docToHtml() throws Exception {
	    File staticDir = new File(new File(DEFAULT_BASE_DIR).getAbsolutePath(), "static");
	    return docToHtml(
	            new File(staticDir, "test.doc").getPath(),
	            new File(staticDir, "test2.html").getPath(),
	            new File(staticDir, "image").getPath());
	}

	/**
	 * Converts the given {@code .doc} file into an HTML file.
	 *
	 * <p>Pictures are referenced in the HTML as {@code <imageDirName's last segment>/<picture name>},
	 * so the image directory is expected to sit next to the generated HTML file.
	 *
	 * @param sourceFileName path of the {@code .doc} file to read
	 * @param targetFileName path of the HTML file to create
	 * @param imageDirName   directory that receives the extracted pictures; created if missing
	 * @return {@code targetFileName}
	 * @throws Exception if the source cannot be read or the HTML cannot be written
	 */
	public static String docToHtml(String sourceFileName, String targetFileName, String imageDirName)
	        throws Exception {
	    File imageDir = new File(imageDirName);
	    if (!imageDir.exists()) {
	        // Best effort: a failure here surfaces later as a reported picture-save error.
	        imageDir.mkdirs();
	    }

	    // Close the input stream as soon as the document is constructed so the file handle is not leaked.
	    HWPFDocument wordDocument;
	    try (FileInputStream sourceStream = new FileInputStream(sourceFileName)) {
	        wordDocument = new HWPFDocument(sourceStream);
	    }

	    org.w3c.dom.Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
	    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);
	    String imageUrlPrefix = imageDir.getName() + "/";
	    // Save each picture and return the relative path that the HTML should reference.
	    wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {
	        try (FileOutputStream out = new FileOutputStream(new File(imageDir, name))) {
	            out.write(content);
	        } catch (java.io.IOException e) {
	            // Deliberately best-effort: one unsavable picture must not abort the whole conversion.
	            e.printStackTrace();
	        }
	        return imageUrlPrefix + name;
	    });
	    wordToHtmlConverter.processDocument(wordDocument);

	    // Serialize the DOM built by the converter as indented UTF-8 HTML.
	    org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
	    DOMSource domSource = new DOMSource(htmlDocument);
	    StreamResult streamResult = new StreamResult(new File(targetFileName));
	    TransformerFactory tf = TransformerFactory.newInstance();
	    Transformer serializer = tf.newTransformer();
	    serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
	    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
	    serializer.setOutputProperty(OutputKeys.METHOD, "html");
	    serializer.transform(domSource, streamResult);
	    return targetFileName;
	}

}

调用

/**
 * Demo entry point: converts the sample {@code .doc} file to HTML, then prints the full HTML
 * followed by only the content of its {@code <body>} element.
 */
public class DocHtml {

	/**
	 * Runs the conversion and prints the results to standard output.
	 *
	 * @param args unused
	 * @throws Exception if the conversion fails
	 */
	public static void main(String[] args) throws Exception {
	    String htmlFile = DocToHtml.docToHtml();
	    String html = DocGetHtml.readfile(htmlFile);
	    System.out.println(html);
	    System.out.println(extractBody(html));
	}

	/**
	 * Returns the markup between the opening and closing {@code body} tags.
	 *
	 * <p>The opening tag is matched as {@code <body ...>} so that a tag carrying attributes
	 * (for example {@code <body class="...">}) is handled. If no complete body element is
	 * found, the input is returned unchanged instead of failing with an index error.
	 */
	private static String extractBody(String html) {
	    int openTagStart = html.indexOf("<body");
	    if (openTagStart < 0) {
	        return html;
	    }
	    int openTagEnd = html.indexOf('>', openTagStart);
	    if (openTagEnd < 0) {
	        return html;
	    }
	    int closeTagStart = html.indexOf("</body>", openTagEnd + 1);
	    if (closeTagStart < 0) {
	        return html;
	    }
	    return html.substring(openTagEnd + 1, closeTagStart);
	}

}

docx转html

package com.html;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;

import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;




/**
 * Converts an OOXML Word document ({@code .docx}) into an XHTML file using the xdocreport
 * {@link XHTMLConverter}, extracting embedded pictures to a separate image directory.
 */
public class DocxToHtml {

	/** Base directory used by the no-argument {@link #docxToHtml()} entry point. */
	private static final String DEFAULT_BASE_DIR = "C:/mb";

	/**
	 * Converts {@code <base>/static/test.docx} into {@code <base>/static/test.html}, storing
	 * pictures under {@code <base>/static/image}, where {@code <base>} is {@code C:/mb}.
	 *
	 * @return the path of the generated HTML file
	 * @throws Exception if the source cannot be read or the HTML cannot be written
	 */
	public static String docxToHtml() throws Exception {
	    File staticDir = new File(new File(DEFAULT_BASE_DIR).getAbsolutePath(), "static");
	    return docxToHtml(
	            new File(staticDir, "test.docx").getPath(),
	            new File(staticDir, "test.html").getPath(),
	            new File(staticDir, "image").getPath());
	}

	/**
	 * Converts the given {@code .docx} file into an HTML file.
	 *
	 * <p>Image URLs in the HTML are resolved against the last path segment of {@code imagePath},
	 * so the image directory is expected to sit next to the generated HTML file.
	 *
	 * @param sourceFileName path of the {@code .docx} file to read
	 * @param targetFileName path of the HTML file to create
	 * @param imagePath      directory that receives the extracted pictures
	 * @return {@code targetFileName}
	 * @throws Exception if the source cannot be read or the HTML cannot be written
	 */
	public static String docxToHtml(String sourceFileName, String targetFileName, String imagePath)
	        throws Exception {
	    File imageDir = new File(imagePath);

	    // try-with-resources closes both the source stream and the writer, even when conversion fails.
	    try (FileInputStream sourceStream = new FileInputStream(sourceFileName)) {
	        XWPFDocument document = new XWPFDocument(sourceStream);
	        XHTMLOptions options = XHTMLOptions.create();
	        // Folder that receives the extracted pictures.
	        options.setExtractor(new FileImageExtractor(imageDir));
	        // Path prefix used for pictures inside the generated HTML.
	        options.URIResolver(new BasicURIResolver(imageDir.getName()));
	        try (FileOutputStream targetStream = new FileOutputStream(targetFileName);
	                OutputStreamWriter writer = new OutputStreamWriter(targetStream, "utf-8")) {
	            XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
	            xhtmlConverter.convert(document, writer, options);
	        }
	    }
	    return targetFileName;
	}

}

调用

package com.html;

/**
 * Demo entry point: converts the sample {@code .docx} file to HTML, then prints the full HTML
 * followed by only the content of its {@code <body>} element.
 */
public class DocxHtml {

	/**
	 * Runs the conversion and prints the results to standard output.
	 *
	 * @param args unused
	 * @throws Exception if the conversion fails
	 */
	public static void main(String[] args) throws Exception {
	    String htmlFile = DocxToHtml.docxToHtml();
	    String html = DocGetHtml.readfile(htmlFile);
	    System.out.println(html);
	    System.out.println(extractBody(html));
	}

	/**
	 * Returns the markup between the opening and closing {@code body} tags.
	 *
	 * <p>The opening tag is matched as {@code <body ...>} so that a tag carrying attributes is
	 * handled. If no complete body element is found, the input is returned unchanged instead of
	 * failing with an index error.
	 */
	private static String extractBody(String html) {
	    int openTagStart = html.indexOf("<body");
	    if (openTagStart < 0) {
	        return html;
	    }
	    int openTagEnd = html.indexOf('>', openTagStart);
	    if (openTagEnd < 0) {
	        return html;
	    }
	    int closeTagStart = html.indexOf("</body>", openTagEnd + 1);
	    if (closeTagStart < 0) {
	        return html;
	    }
	    return html.substring(openTagEnd + 1, closeTagStart);
	}

}

实际项目是用 gradle 管理依赖的。因为是大项目，之前已经导入了一部分包，但没有把需要的包全部导入，所以运行时一直报找不到类，这个原因找了很久。

实际项目这样使用

"org.apache.poi:poi:3.11",
               "org.apache.poi:poi-excelant:3.11",
               "org.apache.poi:poi-ooxml:3.11",
               "org.apache.poi:poi-ooxml-schemas:3.11",
               "org.apache.poi:poi-scratchpad:3.11",
               "org.apache.xmlbeans:xmlbeans:2.6.0",

compile("fr.opensagres.xdocreport:xdocreport:1.0.6")
compile("org.jsoup:jsoup:1.11.3")
compile("org.apache.poi:poi-scratchpad:3.12")
compile("org.apache.poi:ooxml-schemas:1.1")

判断文件是 doc 还是 docx 时，是直接根据后缀名来判断的。但实际使用中，可能有人直接修改了后缀名（或做了其他操作），导致用 doc 的方式解析时报错，提示应该使用 docx 的方法。所以这里在捕获到异常时，改为调用 docx 的转换方法：

// Fragment: try to open the input as a legacy .doc document first.
// NOTE(review): fileInputStream, Filepath and SourceFilePath, as well as the two-argument
// docxToHtml overload, are defined outside this fragment.
HWPFDocument wordDocument=null;
   try{
       wordDocument = new HWPFDocument(fileInputStream);
   }catch (Exception e) {
       // Any failure to open the stream as .doc falls back to the .docx conversion.
       // NOTE(review): this catches every Exception, not only a "file is actually docx" error,
       // and the original exception is discarded; confirm that this is intended.
       return docxToHtml(Filepath,SourceFilePath);

}

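DocGetHtml.readfile 的实现部分（注意：下面贴的是实际项目中的类，包名和类名是 com.ly.mp.pvoa.news.service.HtmlFotStringUtil；要配合上面的调用代码使用，需要把类名改为 DocGetHtml，或者相应修改调用处）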

package com.ly.mp.pvoa.news.service;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.jsoup.Jsoup;


public class HtmlFotStringUtil {
	
	
	public static String readfile(String filePath) {
	    File file = new File(filePath);
	    
	    
	    StringBuffer htmlSb = new StringBuffer();
	    org.jsoup.nodes.Document parse;
		try {
			parse = Jsoup.parse(file, "utf-8");
			htmlSb.append(parse.html());
		} catch (IOException e1) {
			// TODO 自动生成的 catch 块
			e1.printStackTrace();
		}
		  return htmlSb.toString();
	    /*
	    InputStream input = null;
	    try {
	        input = new FileInputStream(file);
	    } catch (FileNotFoundException e) {
	        e.printStackTrace();
	    }
	    StringBuffer buffer = new StringBuffer();
	    byte[] bytes = new byte[1024];
	    try {
	        for (int n; (n = input.read(bytes)) != -1;) {
	            buffer.append(new String(bytes, 0, n, "utf-8"));
	        }
	    } catch (IOException e) {
	        e.printStackTrace();
	    }
	    return buffer.toString();*/
	}

}

派bigStart

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
word转html

jar要注意导入的jar如果有的包缺少或者导入版本不对会在运行的时候报异常<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4...
复制链接

扫一扫