java将word文档按照标题进行内容解析获取

1.需求说明

需要将 Word 文档按标题(例如一级标题、二级标题)拆分,取出每个标题下面的内容并以 HTML 字符串形式返回;文档中的图片转成 Base64(data URI)内嵌显示。

2.代码实现

不说废话,直接上代码

package com.test.chapter;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Base64;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.aspose.words.Document;
import com.aspose.words.SaveFormat;

/**
 * Demo that converts a Word document to HTML with Aspose.Words, inlines the exported
 * images as Base64 data URIs and splits the HTML into sections keyed by heading.
 *
 * @author zcc
 */
public class Test
{

	/**
	 * Converts a fixed sample document and prints the HTML found under one heading.
	 * Any failure is reported on stderr via the stack trace.
	 *
	 * @param args unused
	 */
	public static void main(String[] args)
	{
		// Path of the Word file to parse.
		String filePath = "F:\\chapter\\test.docx";
		// Working directory that receives the exported HTML file and its image files.
		String tempPath = "F:\\chapter\\";

		File tempDir = new File(tempPath);
		if (!tempDir.exists())
		{
			tempDir.mkdirs();
		}

		// try-with-resources closes the input stream on every path.
		try (InputStream input = new FileInputStream(filePath))
		{
			Document doc = new Document(input);
			// Export the whole document as a single HTML file.
			String htmlFilePath = tempPath + "test.html";
			doc.save(htmlFilePath, SaveFormat.HTML);

			String htmlStr = getHtmlStrFromFile(htmlFilePath);
			org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr);
			// Replace image file references with inline Base64 data.
			changeImageSrc(htmlDoc, tempPath);

			Map<String, String> map = exactContentFromHtml(htmlDoc, 2);
			String string = map.get("h1_标题名称");
			System.out.println(string);
		}
		catch (Exception e)
		{
			e.printStackTrace();
		}
	}

	/**
	 * Splits the HTML into sections, one per heading.
	 * <p>
	 * Starting at the first {@code h1}, the method walks the following sibling elements.
	 * Each {@code h1} (and each {@code h2} when {@code level == 2}) starts a new section;
	 * the outer HTML of every other sibling is appended to the current section. Content
	 * located before the first {@code h1} is not returned. If two headings produce the
	 * same key, the later section replaces the earlier one.
	 *
	 * @param htmlDoc parsed HTML document
	 * @param level   pass 2 to also split on {@code h2}; any other value splits on {@code h1} only
	 * @return insertion-ordered map; key is {@code h1_<title>} or {@code h2_<title>} with
	 *         spaces removed from the title, value is the section HTML. Empty when the
	 *         document contains no {@code h1}.
	 */
	private static Map<String, String> exactContentFromHtml(org.jsoup.nodes.Document htmlDoc, int level)
	{
		Map<String, String> sections = new LinkedHashMap<String, String>();

		Elements h1List = htmlDoc.getElementsByTag("h1");
		if (h1List.isEmpty())
		{
			// Report the problem and return an empty result instead of throwing and
			// immediately catching an exception.
			System.err.println("上传的文件中不存在一级标题,请检查!");
			return sections;
		}

		Element current = h1List.get(0);
		// The first key is normalized like all later keys so that lookups are consistent.
		String key = "h1_" + removeNullChar(current.text());
		StringBuilder html = new StringBuilder();

		for (current = current.nextElementSibling(); current != null; current = current.nextElementSibling())
		{
			String tag = current.tagName();
			boolean startsSection = "h1".equals(tag) || (level == 2 && "h2".equals(tag));
			if (startsSection)
			{
				// Close the section collected so far and open a new one.
				sections.put(key, html.toString());
				html.setLength(0);
				key = tag + "_" + removeNullChar(current.text());
			}
			else
			{
				html.append(current.outerHtml());
			}
		}

		// Store the last open section.
		sections.put(key, html.toString());
		return sections;
	}

	/**
	 * Removes every space character from the text.
	 *
	 * @param text text to clean, may be {@code null}
	 * @return the text without spaces, trimmed; {@code null} if {@code text} is {@code null}
	 */
	private static String removeNullChar(String text)
	{
		if (text == null)
		{
			return null;
		}
		return text.replace(" ", "").trim();
	}

	/**
	 * Rewrites the {@code src} of every {@code img} element to a Base64 data URI.
	 * <p>
	 * Images without a {@code src}, already-inlined images ({@code data:}) and images
	 * referenced by an http(s) URL are left untouched because they are not local files.
	 *
	 * @param htmlDoc parsed HTML document, modified in place
	 * @param file    directory that the relative image paths are resolved against
	 * @throws IOException if an image file cannot be read
	 */
	private static void changeImageSrc(org.jsoup.nodes.Document htmlDoc, String file) throws IOException
	{
		for (Element image : htmlDoc.getElementsByTag("img"))
		{
			String src = image.attr("src");
			boolean notLocalFile = src.isEmpty() || src.startsWith("data:") || src.startsWith("http://")
					|| src.startsWith("https://");
			if (notLocalFile)
			{
				continue;
			}
			// File(parent, child) joins the two parts without doubling the separator.
			String imagePath = new File(file, src).getPath();
			image.attr("src", imageConvertBase64(imagePath));
		}
	}

	/**
	 * Reads an image file and returns it as a {@code data:image/...;base64,...} URI.
	 * <p>
	 * The MIME subtype is the lower-cased file extension, with {@code jpg} mapped to
	 * {@code jpeg} and {@code svg} mapped to {@code svg+xml}.
	 *
	 * @param filePath path of the image file
	 * @return the data URI
	 * @throws IOException if the file cannot be opened or read
	 */
	private static String imageConvertBase64(String filePath) throws IOException
	{
		String subtype = filePath.substring(filePath.lastIndexOf('.') + 1).toLowerCase(java.util.Locale.ROOT);
		if ("jpg".equals(subtype))
		{
			subtype = "jpeg";
		}
		else if ("svg".equals(subtype))
		{
			subtype = "svg+xml";
		}

		try (InputStream in = new FileInputStream(filePath))
		{
			// Read until end of stream; available() plus one read() may return fewer bytes.
			byte[] data = IOUtils.toByteArray(in);
			// The basic encoder emits no line breaks, so no whitespace cleanup is needed.
			return "data:image/" + subtype + ";base64," + Base64.getEncoder().encodeToString(data);
		}
	}

	/**
	 * Reads a whole file as a UTF-8 string.
	 *
	 * @param filePath path of the file to read
	 * @return the file content
	 * @throws IOException if the file cannot be opened or read
	 */
	private static String getHtmlStrFromFile(String filePath) throws IOException
	{
		try (InputStream in = new FileInputStream(filePath))
		{
			return IOUtils.toString(in, "UTF-8");
		}
	}
}

3.jar包

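根据上面代码的 import,需要以下 jar 包:aspose-words(com.aspose.words)、jsoup(org.jsoup)、commons-io(org.apache.commons.io)、commons-lang 2.x(org.apache.commons.lang)。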

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值