word转html

注意导入的jar包:如果缺少某些包,或者导入的版本不对,运行时会报异常。

 

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <!-- Maven build descriptor for the Word-to-HTML conversion sample. -->
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.html</groupId>
  <artifactId>html</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  
   <dependencies>
<!-- Apache POI core; all org.apache.poi artifacts below are pinned to the same 3.14 release. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.14</version>
</dependency>

<!-- Presumably supplies the org.apache.poi.hwpf classes used for .doc input; verify. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>

<!-- Presumably supplies the org.apache.poi.xwpf.usermodel classes used for .docx input; verify. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.14</version>
</dependency>

<!-- Presumably supplies the org.apache.poi.xwpf.converter (XHTMLConverter) classes; verify. -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>xdocreport</artifactId>
    <version>1.0.6</version>
</dependency>

<!-- NOTE(review): this and ooxml-schemas below both look like OOXML schema artifacts;
     confirm that both are really required on the classpath. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml-schemas</artifactId>
    <version>3.14</version>
</dependency>

<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.3</version>
</dependency>

<!-- HTML parser used to read the generated HTML file back into a string. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
   </dependencies>
</project>

doc转html

package com.html;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;

/**
 * Converts a binary Word document ({@code .doc}) into an HTML file using
 * POI's {@link WordToHtmlConverter}.
 */
public class DocToHtml {

	/**
	 * Converts {@code static\test.doc} under {@code C:/mb} into
	 * {@code static\test2.html}, writing embedded pictures to
	 * {@code static\image\}.
	 *
	 * @return the absolute path of the generated HTML file
	 * @throws Exception if the source document cannot be read or the HTML
	 *         cannot be produced
	 */
	public static String docToHtml() throws Exception {
	    File path = new File("C:/mb");
	    String imagePathStr = path.getAbsolutePath() + "\\static\\image\\";
	    String sourceFileName = path.getAbsolutePath() + "\\static\\test.doc";
	    String targetFileName = path.getAbsolutePath() + "\\static\\test2.html";

	    // Make sure the picture directory exists before the converter writes into it.
	    File imageDir = new File(imagePathStr);
	    if (!imageDir.exists()) {
	        imageDir.mkdirs();
	    }

	    // try-with-resources: the source stream is released even when conversion fails.
	    try (FileInputStream sourceStream = new FileInputStream(sourceFileName)) {
	        HWPFDocument wordDocument = new HWPFDocument(sourceStream);
	        org.w3c.dom.Document document =
	                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
	        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);

	        // Save each picture to disk and return its path relative to the HTML file.
	        wordToHtmlConverter.setPicturesManager((content, pictureType, name, width, height) -> {
	            try (FileOutputStream out = new FileOutputStream(imagePathStr + name)) {
	                out.write(content);
	            } catch (Exception e) {
	                // Best-effort: a picture that cannot be saved must not abort the conversion.
	                e.printStackTrace();
	            }
	            return "image/" + name;
	        });
	        wordToHtmlConverter.processDocument(wordDocument);

	        // Serialize the converter's DOM tree as indented UTF-8 HTML.
	        org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
	        DOMSource domSource = new DOMSource(htmlDocument);
	        StreamResult streamResult = new StreamResult(new File(targetFileName));
	        TransformerFactory tf = TransformerFactory.newInstance();
	        Transformer serializer = tf.newTransformer();
	        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
	        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
	        serializer.setOutputProperty(OutputKeys.METHOD, "html");
	        serializer.transform(domSource, streamResult);
	    }
	    return targetFileName;
	}

}

调用

/**
 * Command-line entry point that converts the sample {@code .doc} file to HTML
 * and prints both the whole document and the content of its body element.
 */
public class DocHtml {

	/**
	 * Runs the conversion, prints the generated HTML, then prints only the
	 * markup found between the opening and closing body tags.
	 *
	 * @param args unused
	 * @throws Exception if the conversion fails
	 */
	public static void main(String[] args) throws Exception {
		String htmlPath = DocToHtml.docToHtml();
		// NOTE(review): DocGetHtml is not defined in the visible source; confirm which class provides readfile.
		String html = DocGetHtml.readfile(htmlPath);
		System.out.println(html);
		System.out.println(extractBody(html));
	}

	/**
	 * Returns the markup between the opening body tag and {@code </body>}.
	 * The opening tag may carry attributes (for example {@code <body class="x">});
	 * a missing opening tag yields an empty string, and a missing closing tag
	 * yields everything after the opening tag.
	 */
	private static String extractBody(String html) {
		int openStart = html.indexOf("<body");
		if (openStart < 0) {
			return "";
		}
		int openEnd = html.indexOf('>', openStart);
		if (openEnd < 0) {
			return "";
		}
		int closeStart = html.indexOf("</body>", openEnd + 1);
		int end = closeStart < 0 ? html.length() : closeStart;
		return html.substring(openEnd + 1, end);
	}

}

docx转html

package com.html;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;

import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;




/**
 * Converts an OOXML Word document ({@code .docx}) into an HTML file using the
 * {@link XHTMLConverter}.
 */
public class DocxToHtml {

	/**
	 * Converts {@code static\test.docx} under {@code C:/mb} into
	 * {@code static\test.html}, extracting pictures into {@code static\image}.
	 *
	 * @return the absolute path of the generated HTML file
	 * @throws Exception if the source document cannot be read or the HTML
	 *         cannot be written
	 */
	public static String docxToHtml() throws Exception {
	    File path = new File("C:/mb");
	    String imagePath = path.getAbsolutePath() + "\\static\\image";
	    String sourceFileName = path.getAbsolutePath() + "\\static\\test.docx";
	    String targetFileName = path.getAbsolutePath() + "\\static\\test.html";

	    // try-with-resources: the source stream is released even when conversion fails.
	    try (FileInputStream sourceStream = new FileInputStream(sourceFileName)) {
	        XWPFDocument document = new XWPFDocument(sourceStream);
	        XHTMLOptions options = XHTMLOptions.create();
	        // Folder that receives the extracted pictures.
	        options.setExtractor(new FileImageExtractor(new File(imagePath)));
	        // Path prefix used for pictures inside the generated HTML.
	        options.URIResolver(new BasicURIResolver("image"));

	        // The target file is opened only after the document has been parsed,
	        // so an unreadable source does not leave an empty HTML file behind.
	        try (OutputStreamWriter htmlWriter =
	                new OutputStreamWriter(new FileOutputStream(targetFileName), "utf-8")) {
	            XHTMLConverter xhtmlConverter = (XHTMLConverter) XHTMLConverter.getInstance();
	            xhtmlConverter.convert(document, htmlWriter, options);
	        }
	    }
	    return targetFileName;
	}

}

 

调用

package com.html;

/**
 * Command-line entry point that converts the sample {@code .docx} file to HTML
 * and prints both the whole document and the content of its body element.
 */
public class DocxHtml {

	/**
	 * Runs the conversion, prints the generated HTML, then prints only the
	 * markup found between the opening and closing body tags.
	 *
	 * @param args unused
	 * @throws Exception if the conversion fails
	 */
	public static void main(String[] args) throws Exception {
		String htmlPath = DocxToHtml.docxToHtml();
		// NOTE(review): DocGetHtml is not defined in the visible source; confirm which class provides readfile.
		String html = DocGetHtml.readfile(htmlPath);
		System.out.println(html);
		System.out.println(extractBody(html));
	}

	/**
	 * Returns the markup between the opening body tag and {@code </body>}.
	 * The opening tag may carry attributes (for example {@code <body class="x">});
	 * a missing opening tag yields an empty string, and a missing closing tag
	 * yields everything after the opening tag.
	 */
	private static String extractBody(String html) {
		int openStart = html.indexOf("<body");
		if (openStart < 0) {
			return "";
		}
		int openEnd = html.indexOf('>', openStart);
		if (openEnd < 0) {
			return "";
		}
		int closeStart = html.indexOf("</body>", openEnd + 1);
		int end = closeStart < 0 ? html.length() : closeStart;
		return html.substring(openEnd + 1, end);
	}

}

 

实际项目中是用gradle管理依赖的。由于项目较大,之前已经导入了一部分包,但没有全部导入,所以一直报找不到包的错误,排查了很久才找到原因。

实际项目这样使用

                   "org.apache.poi:poi:3.11",
                    "org.apache.poi:poi-excelant:3.11",
                    "org.apache.poi:poi-ooxml:3.11",
                    "org.apache.poi:poi-ooxml-schemas:3.11",
                    "org.apache.poi:poi-scratchpad:3.11",
                    "org.apache.xmlbeans:xmlbeans:2.6.0",

      compile("fr.opensagres.xdocreport:xdocreport:1.0.6")
    compile("org.jsoup:jsoup:1.11.3") 
    compile("org.apache.poi:poi-scratchpad:3.12") 
    compile("org.apache.poi:ooxml-schemas:1.1") 

 

 

判断doc还是docx时,是直接根据文件后缀名来判断的。但实际中可能有人直接修改了后缀名(或做了其他操作),这时按doc处理会报错,提示要用docx的方法。所以在异常处理中改为调用docx的转换方法。

 

    HWPFDocument wordDocument=null;
        try{
             wordDocument = new HWPFDocument(fileInputStream);
        }catch (Exception e) {
            return docxToHtml(Filepath,SourceFilePath);

        }

 

 

 

 

 

DocGetHtml.readfile的部分(下面贴出的代码中,该类名为HtmlFotStringUtil)

 

package com.ly.mp.pvoa.news.service;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;

import org.jsoup.Jsoup;


public class HtmlFotStringUtil {
	
	
	public static String readfile(String filePath) {
	    File file = new File(filePath);
	    
	    
	    StringBuffer htmlSb = new StringBuffer();
	    org.jsoup.nodes.Document parse;
		try {
			parse = Jsoup.parse(file, "utf-8");
			htmlSb.append(parse.html());
		} catch (IOException e1) {
			// TODO 自动生成的 catch 块
			e1.printStackTrace();
		}
		  return htmlSb.toString();
	    /*
	    InputStream input = null;
	    try {
	        input = new FileInputStream(file);
	    } catch (FileNotFoundException e) {
	        e.printStackTrace();
	    }
	    StringBuffer buffer = new StringBuffer();
	    byte[] bytes = new byte[1024];
	    try {
	        for (int n; (n = input.read(bytes)) != -1;) {
	            buffer.append(new String(bytes, 0, n, "utf-8"));
	        }
	    } catch (IOException e) {
	        e.printStackTrace();
	    }
	    return buffer.toString();*/
	}

}

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值