使用 Spire.Doc 导入 docx 格式的 Word 文档,并将其转换为 HTML 代码。
转换后会生成 3 项内容:xxx.html、xxx.css,以及存放 Word 中图片的文件夹。需求是将样式和图片一并内嵌到 xxx.html 的代码中,其中图片以 base64 数据替换 img 标签的 src 路径。
附官网链接: Spire.Doc for Java
maven依赖
<!-- https://mvnrepository.com/artifact/e-iceblue/spire.office -->
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.office.free</artifactId>
<version>5.3.1</version>
</dependency>
方法
/**
 * Converts a Word document to a single HTML string, inlining the generated stylesheet
 * and replacing image {@code src} values with base64 PNG data URIs.
 *
 * <p>The document is first saved as HTML under {@code docxFilePath}; the generated
 * {@code .html} and {@code _styles.css} files are read back, merged, and the whole
 * {@code docxFilePath} directory is then deleted.
 *
 * <p>Images are only inlined when the number of pictures collected from the document
 * equals the number of {@code <img>} tags that carry a {@code src}; otherwise the
 * original {@code src} values are left untouched.
 *
 * @param filePath path of the Word document to convert
 * @param response servlet response; not used by this method
 * @return the merged HTML, or an empty string if any step throws
 */
public String importConclusion(String filePath, HttpServletResponse response) {
    File importFile = new File(filePath);
    try {
        List<BufferedImage> images = new ArrayList<>();
        String uuid = UUID.randomUUID().toString();
        Document doc = new Document();
        try {
            doc.setKeepSameFormat(true);
            doc.loadFromFile(importFile.getPath());
            // Collect every picture that is a child object of a section paragraph.
            for (int i = 0; i < doc.getSections().getCount(); i++) {
                int paragraphsCount = doc.getSections().get(i).getParagraphs().getCount();
                for (int j = 0; j < paragraphsCount; j++) {
                    Paragraph paragraph = doc.getSections().get(i).getParagraphs().get(j);
                    for (int z = 0; z < paragraph.getChildObjects().getCount(); z++) {
                        Object obj = paragraph.getChildObjects().get(z);
                        if (obj instanceof DocPicture) {
                            images.add(((DocPicture) obj).getImage());
                        }
                    }
                }
            }
            // Make sure the output directory exists before saving into it.
            File dirFile = new File(docxFilePath);
            if (!dirFile.exists()) {
                dirFile.mkdirs();
            }
            doc.saveToFile(docxFilePath + uuid + "docx.html", FileFormat.Html);
        } finally {
            // Release the document even when loading or saving fails.
            doc.dispose();
        }

        // Read the generated stylesheet and wrap it so it can replace the <link> tag.
        String cssStyle = readCss(docxFilePath + uuid + "docx_styles.css");
        cssStyle = "<style>" + cssStyle + "</style>";
        String htmlStr = readCss(docxFilePath + uuid + "docx.html");
        htmlStr = htmlStr.replace("<link href=\"" + uuid + "docx_styles.css\" type=\"text/css\" rel=\"stylesheet\"/>", cssStyle);

        // Gather the src value of every <img> tag, in document order.
        List<String> imgList = new ArrayList<>();
        Pattern imgPattern = Pattern.compile("<(img|IMG)(.*?)(/>|></img>|>)");
        Pattern srcPattern = Pattern.compile("(src|SRC)=(\"|\')(.*?)(\"|\')");
        Matcher imgMatcher = imgPattern.matcher(htmlStr);
        while (imgMatcher.find()) {
            Matcher srcMatcher = srcPattern.matcher(imgMatcher.group(2));
            if (srcMatcher.find()) {
                imgList.add(srcMatcher.group(3));
            }
        }

        // Pair pictures with <img> tags by position.
        // NOTE(review): assumes the generated HTML lists images in the same order they
        // were collected above - confirm against the converter's output.
        if (images.size() != 0 && images.size() == imgList.size()) {
            for (int i = 0; i < images.size(); i++) {
                ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
                ImageIO.write(images.get(i), "png", outputStream);
                // RFC 2397 form: no whitespace between ";base64" and the comma.
                String base64 = "data:image/png;base64," + Base64.getEncoder().encodeToString(outputStream.toByteArray());
                htmlStr = htmlStr.replace(imgList.get(i), base64);
            }
        }

        // Remove the generated files; this deletes the whole docxFilePath directory.
        deleteAllFiles(docxFilePath);
        return htmlStr;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return "";
}
读取文件内容
/**
 * Reads the entire content of a file into a string.
 *
 * <p>The bytes are decoded with the platform default charset.
 * NOTE(review): the files read here are produced by the HTML export - confirm their
 * encoding and pass an explicit charset if it is fixed.
 *
 * @param filePath path of the file to read
 * @return the file content; an empty string for an empty file
 * @throws IOException if the file cannot be opened or read
 */
public String readCss(String filePath) throws IOException {
    File file = new File(filePath);
    // try-with-resources closes the stream on every path and never touches a null stream.
    try (FileInputStream in = new FileInputStream(file)) {
        byte[] buffer = new byte[(int) file.length()];
        int offset = 0;
        // A single read() may return fewer bytes than requested, so loop until full or EOF.
        while (offset < buffer.length) {
            int count = in.read(buffer, offset, buffer.length - offset);
            if (count < 0) {
                break;
            }
            offset += count;
        }
        return new String(buffer, 0, offset);
    }
}
删除文件夹及文件夹下的所有内容
/**
 * Recursively deletes a directory together with everything inside it.
 *
 * @param dir path of the directory to delete
 * @return {@code true} only if every entry and the directory itself were deleted;
 *         {@code false} if {@code dir} does not exist, is not a directory, cannot be
 *         listed, or any deletion fails
 */
public static boolean deleteAllFiles(String dir) {
    File dirFile = new File(dir);
    if (!dirFile.exists() || !dirFile.isDirectory()) {
        return false;
    }
    // listFiles() returns null when the directory cannot be read.
    File[] files = dirFile.listFiles();
    if (files == null) {
        return false;
    }
    boolean flag = true;
    for (File child : files) {
        // Evaluate the deletion first so a previous failure never skips later entries.
        boolean deleted = child.isDirectory()
                ? deleteAllFiles(child.getAbsolutePath())
                : child.delete();
        flag &= deleted;
    }
    // The directory itself must also go for the operation to count as successful.
    boolean removedSelf = dirFile.delete();
    return flag && removedSelf;
}