利用POI工具将word文档转成html
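1,导入对应的jar(从下面代码的import可以看出,需要 Apache POI 的 poi、poi-scratchpad(HWPF,处理doc)、poi-ooxml(XWPF,处理docx),xdocreport 提供的 org.apache.poi.xwpf.converter.core 和 org.apache.poi.xwpf.converter.xhtml,以及 log4j)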
2,编写相应的代码,直接上代码
1)、
a,在src目录下创建helper目录
b,在helper目录下创建PoiHelperFactory.java文件
package helper;
import service.IDocHelper;
import service.impl.DocHeplerImpl;
/**
 * Static factory for the POI-based document helpers.
 *
 * @author yanzz
 */
public class PoiHelperFactory {

    /**
     * Creates a new Word-document helper.
     *
     * @return a freshly constructed {@link DocHeplerImpl}, exposed as an {@link IDocHelper}
     */
    public static IDocHelper createDocHelper() {
        final IDocHelper helper = new DocHeplerImpl();
        return helper;
    }
}
2)、
a, src目录下创建service目录以及在service目录下创建impl目录,
b, 分别创建出IDocHelper.java 以及 DocHeplerImpl.java文件(注意:文件名必须与代码中的public类名 DocHeplerImpl 保持一致,否则无法编译)
package service;
/**
 * Contract for helpers that turn a Word document into an HTML string.
 */
public interface IDocHelper {

    /**
     * Converts the document located at the given path into HTML.
     *
     * @param sourceFilePath path of the source document
     * @return the HTML produced for that document
     */
    String transformHtml(String sourceFilePath);
}
package service.impl;
import org.apache.log4j.Logger;
import service.IDocHelper;
import util.PoiUtil;
public class DocHeplerImpl implements IDocHelper {
//poi控制器类的日志
private static Logger logger = Logger.getLogger(DocHeplerImpl.class);
public String transformHtml(String sourceFilePath){
String xxName = PoiUtil.getVersion(sourceFilePath);
String result = "";
if("doc".equals(xxName)){
result = PoiUtil.docTransformHtml(sourceFilePath);
}else if("docx".equals(xxName)){
result = PoiUtil.docxTransformHtml(sourceFilePath);
}else{
logger.info("word文档的格式错误,,,请检查是否是word文档");
}
return result;
}
}
3)、
a,src目录下创建util目录,并创建PoiUtil.java
package util;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Locale;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;
public class PoiUtil {
//poi控制器类的日志
private static Logger logger = Logger.getLogger(PoiUtil.class);
/**
* 这边先写死, 真正开发的时候可以将此处改成
* properties文件或者
* 数据局数据字典进行存储。
*/
public static final String IMAGEPATH ="D:/doc/images/";
/**
* 动态的生成picture的名称通过 UUID
*/
private static String getRandom(){
long curL = System.currentTimeMillis();
return String.valueOf(curL);
}
/**
* 对传入的文件的版本号进行判断
* 看返回的是docx
* 还是 doc
*/
public static String getVersion(String sourceFilePath){
int index = sourceFilePath.lastIndexOf(".");
sourceFilePath = sourceFilePath.substring(index+1);
return sourceFilePath;
}
/**
* 根据目标doc文件路径地址将word的内容打散成html格式的字符串 。
* @param sourceFilePath
* 源文件地址vg
*/
public static String docTransformHtml(String sourceFilePath){
//doc文档的html字符串
String htmlContent ="";
try{
//创建inputStream
InputStream input = new FileInputStream(sourceFilePath);
//创建HWPFDocument对象
HWPFDocument wordDocument = new HWPFDocument(input);
//获取WordToHtmlConverter对象
WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
DocumentBuilderFactory.newInstance().newDocumentBuilder()
.newDocument());
//保存路径这个路径也是html中 的图片引用路径
//<img src ="。。。。。。">
wordToHtmlConverter.setPicturesManager(new PicturesManager() {
public String savePicture(byte[] content, PictureType pictureType,
String suggestedName, float widthInches, float heightInches) {
return IMAGEPATH + suggestedName;
}
});
//设置processDocument
wordToHtmlConverter.processDocument(wordDocument);
//获取图片table列表
List pics = wordDocument.getPicturesTable().getAllPictures();
//将图片文件存入文件夹内
if(pics != null){
for(int i = 0; i < pics.size(); i ++){
Picture pic = (Picture)pics.get(i);
pic.writeImageContent(new FileOutputStream(IMAGEPATH+pic.suggestFullFileName()));
}
}
//获取html的document对象
Document docHtml = wordToHtmlConverter.getDocument();
//创建DOMSource对象
DOMSource domSource = new DOMSource(docHtml);
//创建StreamResult对象
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
StreamResult streamResult = new StreamResult(outStream);
TransformerFactory tf = TransformerFactory.newInstance();
Transformer serializer = tf.newTransformer();
serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
serializer.setOutputProperty(OutputKeys.INDENT, "yes");
serializer.setOutputProperty(OutputKeys.METHOD, "html");
serializer.transform(domSource, streamResult);
outStream.close();
htmlContent = new String(outStream.toByteArray(),"utf-8");
logger.debug("转化html字符串成功...");
}catch(Exception e){
e.printStackTrace();
}
return htmlContent;
}
/**
* 将docx文档打散成html格式的字符串
*/
public static String docxTransformHtml(String sourceFilePath){
String htmlContent = "";
try {
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
File docxFile = new File(sourceFilePath);
XWPFDocument wordDocument =
new XWPFDocument(new FileInputStream(docxFile));
XHTMLOptions options = XHTMLOptions.create()
.URIResolver(new BasicURIResolver(IMAGEPATH));
File imageFolderFile = new File(IMAGEPATH);
options.setExtractor(new FileImageExtractor(imageFolderFile));
XHTMLConverter.getInstance().convert(wordDocument, outStream, options);
//返回字符串
htmlContent = new String(outStream.toByteArray(),"utf-8");
} catch (FileNotFoundException e) {
logger.error("该路径下没有获取到文件,请检查路径是否正确");
e.printStackTrace();
} catch (IOException e) {
logger.error("后台报错...");
e.printStackTrace();
}
return htmlContent;
}
}
代码中用到的word文档路径以及图片存放目录(IMAGEPATH),可以根据自己的实际情况修改成对应的文档和目录。
ps:
代码的百度网盘地址:https://pan.baidu.com/s/1o84UvKQ
个人微信号: 一起学习注明csdn博客就可以了