1.需求说明
需要把 Word 文档按标题(例如一级标题、二级标题)拆分,取出每个标题下面的内容(HTML 字符串),并把其中的图片转成 Base64 内嵌显示。
2.代码实现
不说废话,直接上代码
package com.test.chapter;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.aspose.words.Document;
import com.aspose.words.SaveFormat;
/**
 * Splits a Word document into per-heading HTML fragments.
 *
 * <p>The document is saved as HTML through Aspose.Words, every {@code <img>} source is replaced
 * by a Base64 data URI, and the markup following each heading is collected into a map keyed by
 * heading.
 *
 * @author zcc
 */
public class Test
{
    /**
     * Demo entry point: converts a hard-coded Word file and prints the HTML of one chapter.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Path of the Word file to process
        String filePath = "F:\\chapter\\test.docx";
        // Working directory: the generated HTML is written here and image src values are
        // resolved against it
        String tempPath = "F:\\chapter\\";
        // try-with-resources closes the stream on every exit path
        try (InputStream input = new FileInputStream(filePath))
        {
            File tempDir = new File(tempPath);
            if (!tempDir.exists())
            {
                tempDir.mkdirs();
            }
            Document doc = new Document(input);
            // Save the whole document as a single HTML file
            String htmlFilePath = tempPath + "test.html";
            doc.save(htmlFilePath, SaveFormat.HTML);
            String htmlStr = getHtmlStrFromFile(htmlFilePath);
            org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr);
            changeImageSrc(htmlDoc, tempPath); // inline the images as Base64
            Map<String, String> map = exactContentFromHtml(htmlDoc, 2);
            // Keys are "h1_" / "h2_" followed by the heading text
            String string = map.get("h1_标题名称");
            System.out.println(string);
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
    }

    /**
     * Extracts the HTML that follows each heading.
     *
     * <p>Starting at the first {@code <h1>}, the method walks that element's following siblings.
     * Each {@code <h1>} (and each {@code <h2>} when {@code level == 2}) starts a new entry; the
     * outer HTML of every other sibling is appended to the current entry. Headings nested inside
     * other elements are not treated as boundaries because only siblings are visited. When two
     * headings produce the same key, the later content replaces the earlier one.
     *
     * @param htmlDoc parsed HTML document
     * @param level   pass {@code 2} to split on {@code <h2>} as well; any other value splits on
     *                {@code <h1>} only
     * @return insertion-ordered map, key = {@code "h1_" + heading} or {@code "h2_" + heading},
     *         value = HTML; empty when the document has no {@code <h1>}
     */
    private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level)
    {
        Map<String, String> sections = new LinkedHashMap<String, String>();
        Elements topHeadings = htmlDoc.getElementsByTag("h1");
        if (topHeadings.isEmpty())
        {
            // Nothing to split on: report it and hand back the empty map
            System.err.println("上传的文件中不存在一级标题,请检查!");
            return sections;
        }
        boolean splitOnH2 = (level == 2);
        Element current = topHeadings.get(0);
        // The first key is normalised the same way as every later key
        String currentKey = "h1_" + removeNullChar(current.text());
        StringBuilder body = new StringBuilder();
        for (current = current.nextElementSibling(); current != null; current = current.nextElementSibling())
        {
            String tag = current.tagName();
            boolean isH1 = "h1".equals(tag);
            if (isH1 || (splitOnH2 && "h2".equals(tag)))
            {
                // A new heading closes the section collected so far
                sections.put(currentKey, body.toString());
                body.setLength(0);
                currentKey = (isH1 ? "h1_" : "h2_") + removeNullChar(current.text());
                continue;
            }
            body.append(current.outerHtml());
        }
        // Flush the last section
        sections.put(currentKey, body.toString());
        return sections;
    }

    /**
     * Removes every space character from the text and trims the result.
     *
     * @param text heading text, may be {@code null}
     * @return the cleaned text, or {@code null} when {@code text} is {@code null}
     */
    private static String removeNullChar(String text)
    {
        if (text == null)
        {
            return null;
        }
        // Literal replacement; no regular expression is needed for a single character
        return text.replace(" ", "").trim();
    }

    /**
     * Replaces the {@code src} of every {@code <img>} with a Base64 data URI.
     *
     * @param htmlDoc parsed HTML document, modified in place
     * @param file    directory against which the image {@code src} values are resolved
     * @throws IOException if an image file cannot be read
     */
    private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException
    {
        for (Element image : htmlDoc.getElementsByTag("img"))
        {
            String src = image.attr("src");
            // Skip images that have no file to load or are already inlined
            if (src.isEmpty() || src.startsWith("data:"))
            {
                continue;
            }
            image.attr("src", imageConvertBase64(new File(file, src).getPath()));
        }
    }

    /**
     * Reads an image file and returns it as a {@code data:image/...;base64,...} URI.
     *
     * <p>The image subtype is derived from the text after the last {@code '.'} of the path.
     *
     * @param filePath path of the image file
     * @return the data URI
     * @throws IOException if the file cannot be read
     */
    private static String imageConvertBase64(String filePath) throws IOException
    {
        String extension = filePath.substring(filePath.lastIndexOf('.') + 1).toLowerCase(Locale.ROOT);
        // readAllBytes reads the complete file; available() plus a single read() may not
        byte[] data = Files.readAllBytes(new File(filePath).toPath());
        // The basic encoder emits no line separators, so no whitespace stripping is required
        return "data:image/" + imageSubtype(extension) + ";base64," + Base64.getEncoder().encodeToString(data);
    }

    /**
     * Maps a lower-case file extension to the matching {@code image/*} subtype.
     *
     * @param extension lower-case file extension without the dot
     * @return the subtype to place after {@code image/}
     */
    private static String imageSubtype(String extension)
    {
        if ("jpg".equals(extension))
        {
            return "jpeg";
        }
        if ("svg".equals(extension))
        {
            return "svg+xml";
        }
        return extension;
    }

    /**
     * Reads a whole file and decodes it as UTF-8.
     *
     * @param filePath path of the HTML file
     * @return the file content
     * @throws IOException if the file cannot be read
     */
    private static String getHtmlStrFromFile(String filePath) throws IOException
    {
        byte[] content = Files.readAllBytes(new File(filePath).toPath());
        return new String(content, StandardCharsets.UTF_8);
    }
}