POI解析Word批注信息

最新推荐文章于 2023-10-08 16:40:03 发布

灼烧的疯狂

最新推荐文章于 2023-10-08 16:40:03 发布

阅读量3.2k

点赞数 1

分类专栏： Java工具类文章标签： POI解析Word批注 Word批注解析

本文链接：https://blog.csdn.net/qq_33803102/article/details/100116602

版权

Java工具类专栏收录该内容

20 篇文章

订阅专栏

前言：报告审批后，要求解析Word批注信息获取作者、引用正文、批注内容等信息入库，我这边额外加了回复对象（为后续考虑）

先上代码吧：


/**
 * Value holder describing one comment (annotation) found in a Word document.
 *
 * <p>Every setter returns {@code this}, so a fully populated instance can be
 * built with a single chained expression. All properties are {@code null}
 * until they are set.
 *
 * @author : weiheng
 * @version V1.0
 * @date Date : 2019-08-28 10:31
 */
public class WordComment {

	/** Comment ID. */
	private String id;
	/** ID of the comment this one replies to (the reply target). */
	private String replyFor;
	/** Body text the comment is anchored to. */
	private String text;
	/** Content of the comment itself. */
	private String comment;
	/** Author of the comment. */
	private String author;

	// ---------------------------------------------------------------- getters

	/** @return the comment ID */
	public String getId() {
		return this.id;
	}

	/** @return the ID of the comment this one replies to */
	public String getReplyFor() {
		return this.replyFor;
	}

	/** @return the body text the comment is anchored to */
	public String getText() {
		return this.text;
	}

	/** @return the content of the comment */
	public String getComment() {
		return this.comment;
	}

	/** @return the author of the comment */
	public String getAuthor() {
		return this.author;
	}

	// --------------------------------------------------------- fluent setters

	/**
	 * @param id the comment ID
	 * @return this instance, for chaining
	 */
	public WordComment setId(String id) {
		this.id = id;
		return this;
	}

	/**
	 * @param replyFor the ID of the comment this one replies to
	 * @return this instance, for chaining
	 */
	public WordComment setReplyFor(String replyFor) {
		this.replyFor = replyFor;
		return this;
	}

	/**
	 * @param text the body text the comment is anchored to
	 * @return this instance, for chaining
	 */
	public WordComment setText(String text) {
		this.text = text;
		return this;
	}

	/**
	 * @param comment the content of the comment
	 * @return this instance, for chaining
	 */
	public WordComment setComment(String comment) {
		this.comment = comment;
		return this;
	}

	/**
	 * @param author the author of the comment
	 * @return this instance, for chaining
	 */
	public WordComment setAuthor(String author) {
		this.author = author;
		return this;
	}
}



import org.apache.poi.POIXMLDocument;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFComment;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.jeecgframework.core.util.FtpUtils;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import java.io.File;
import java.io.IOException;
import java.util.*;

/**
 * Parses the comments (annotations) of a Word (.docx) file.
 *
 * <p>The remote file is first downloaded to a local temporary file, then the
 * document body XML is walked to find the comment ranges
 * ({@code w:commentRangeStart} / {@code w:commentRangeEnd}) and the body text
 * or pictures they enclose. Finally the comment content and author are looked
 * up by ID through {@link XWPFDocument#getCommentByID(String)}.
 *
 * <p>An instance keeps the state of the parse in progress in its fields, so a
 * single instance must not be used by several threads at the same time. It may
 * be reused for consecutive calls: all per-parse state is reset on each call.
 *
 * @author : weiheng
 * @version V1.0
 * @date Date : 2019-08-27 11:24
 */
public class WordCommentsParseUtil {

	/** Element name of the text carried by a run in the document body. */
	private static final String BODY_WT = "w:t";
	/** Element name of a drawing's properties; used to detect pictures. */
	private static final String BODY_PIC = "wp:docPr";
	/** Element name marking the start of a commented range. */
	private static final String BODY_COMMENT_START = "w:commentRangeStart";
	/** Element name marking the end of a commented range. */
	private static final String BODY_COMMENT_END = "w:commentRangeEnd";

	/** Directory in which the downloaded temporary file is created. */
	private static final String USER_DIR = System.getProperty("user.dir");

	/** Set once every comment of the document has been collected; stops the traversal early. */
	private boolean commentFilledComplete = false;
	/** The Word document being parsed. */
	private XWPFDocument docx = null;
	/** Total number of comments reported by the document. */
	private int totalComments;
	/** Number of comments collected from the body so far. */
	private int commentIndex;
	/** Local path of the temporary copy of the remote file. */
	private String localOutputFile;
	/** Package backing {@link #docx}; must be closed before the temporary file can be deleted. */
	private OPCPackage opcPackage;


	/**
	 * Downloads the remote file and opens it as an {@link XWPFDocument}.
	 *
	 * <p>All per-parse state is reset first, so that a previous call on the
	 * same instance cannot influence this one.
	 *
	 * @param remotePath remote access path of the file (file server)
	 * @throws RuntimeException if the file cannot be opened; the original
	 *         {@link IOException} is attached as the cause
	 * @date 2019-08-27 11:54
	 * @author weiheng
	 */
	private void initDocument(String remotePath) {
		commentFilledComplete = false;
		commentIndex = 0;
		totalComments = 0;
		docx = null;
		opcPackage = null;
		localOutputFile = null;
		try {
			String filename = remotePath.substring(remotePath.lastIndexOf("/") + 1);
			localOutputFile = USER_DIR + "/" + filename;
			FtpUtils.downloadFileFromFtp(remotePath, localOutputFile);

			opcPackage = POIXMLDocument.openPackage(localOutputFile);
			docx = new XWPFDocument(opcPackage);
			totalComments = docx.getComments().length;
		} catch (IOException e) {
			// Keep the original exception as the cause so the stack trace is not lost.
			throw new RuntimeException("初始化文件异常：" + e.getMessage(), e);
		}
	}

	/**
	 * Returns the comments of a remote Word file: ID, anchored body text,
	 * comment content, author and the ID of the comment being replied to.
	 *
	 * <p>The document package is closed and the temporary local file is deleted
	 * before this method returns, whether it completes normally or throws.
	 *
	 * @param fileRemotePath remote access path of the file; must not be {@code null}
	 * @return the comments in the order their ranges end in the body; content
	 *         and author stay {@code null} for an ID the document has no
	 *         comment for
	 * @throws NullPointerException if {@code fileRemotePath} is {@code null}
	 * @throws RuntimeException if the file cannot be opened
	 * @date 2019-08-27 20:41
	 * @author weiheng
	 */
	public List<WordComment> getWordCommentsInfo(String fileRemotePath) {
		Objects.requireNonNull(fileRemotePath, "fileRemotePath");

		List<WordComment> comments = new ArrayList<>(10);
		try {
			// 1. Open the document.
			initDocument(fileRemotePath);

			// 2. Walk the body to collect comment ID, anchored text and reply target.
			fillComments(docx.getDocument().getDomNode(), new LinkedList<>(), new HashMap<>(16), comments);

			// 3. Look up the comment content and author by ID.
			for (WordComment c : comments) {
				XWPFComment comment = docx.getCommentByID(c.getId());
				if (comment == null) {
					// A range marker without a matching comment entry: nothing to copy.
					continue;
				}
				c.setComment(comment.getText())
					.setAuthor(comment.getAuthor());
			}
			return comments;
		} finally {
			// 4. Close the package first, otherwise the temporary file cannot be deleted.
			releaseSource();
			// 5. Remove the local temporary file.
			deleteTempFile();
		}
	}

	/**
	 * Closes the package backing the document. This must happen before the
	 * temporary file is deleted, otherwise the deletion fails.
	 *
	 * @date 2019-08-28 9:59
	 * @author weiheng
	 */
	private void releaseSource() {
		if (opcPackage == null) {
			return;
		}
		try {
			opcPackage.close();
		} catch (IOException e) {
			// Best effort: a failure to close must not hide the parse result or
			// the exception that is already propagating.
			e.printStackTrace();
		} finally {
			opcPackage = null;
			docx = null;
		}
	}

	/**
	 * Deletes the local temporary file, if one was created.
	 *
	 * @return {@code true} if a file existed and was deleted
	 * @date 2019-08-27 20:41
	 * @author weiheng
	 */
	private boolean deleteTempFile() {
		if (localOutputFile == null) {
			return false;
		}
		File file = new File(localOutputFile);
		boolean deleted = file.isFile() && file.delete();
		localOutputFile = null;
		return deleted;
	}

	/**
	 * Recursively walks the body XML and collects one {@link WordComment} per
	 * closed comment range.
	 *
	 * <p>Text and pictures met while at least one range is open are attributed
	 * to the first open comment. A comment whose range is closed while an
	 * earlier comment is still open is recorded as a reply to the comment
	 * opened just before it.
	 *
	 * @param node current XML node of the Word body
	 * @param ids IDs of the comment ranges that are currently open, in opening order
	 * @param values anchored text collected so far, keyed by comment ID
	 * @param comments output list the collected comments are appended to
	 * @throws IllegalArgumentException if any argument is {@code null}
	 * @date 2019-08-27 13:54
	 * @author weiheng
	 */
	private void fillComments(Node node, LinkedList<String> ids, Map<String, String> values, List<WordComment> comments) {

		if (!insureNotNull(node, ids, values, comments)) {
			throw new IllegalArgumentException(this.getClass().getName()
				+ "fillComments(" + node + "," + ids + "," + values + "," + comments + ")");
		}
		String nodeName = node.getNodeName();
		// ID of the first open comment, or "" when no range is open.
		String id = ids.isEmpty() ? "" : ids.getFirst();
		if (BODY_WT.equals(nodeName) && id.length() > 0) {

			// Body text inside a commented range. An empty <w:t/> has no child node.
			Node textNode = node.getFirstChild();
			if (textNode != null) {
				appendText(values, id, textNode.getNodeValue());
			}

		} else if (BODY_PIC.equals(nodeName) && id.length() > 0) {

			// Picture inside a commented range: represented by its name in brackets.
			appendText(values, id, "[" + getAttribute(node, "name") + "]");

		} else if (BODY_COMMENT_START.equals(nodeName)) {

			// A comment range opens; a marker without an ID cannot be matched later, so skip it.
			String startId = getAttribute(node, "w:id");
			if (startId != null) {
				ids.add(startId);
			}

		} else if (BODY_COMMENT_END.equals(nodeName) && id.length() > 0) {

			closeCommentRange(getAttribute(node, "w:id"), ids, values, comments);
		}

		// Recurse into the children until every comment has been collected.
		if (node.hasChildNodes()) {
			NodeList children = node.getChildNodes();
			for (int i = 0; i < children.getLength(); i++) {
				if (commentFilledComplete) {
					break;
				}
				fillComments(children.item(i), ids, values, comments);
			}
		}
	}

	/**
	 * Records the comment whose range ends here and, once every open range has
	 * been recorded, clears the open-range list.
	 *
	 * @param endId ID carried by the range-end marker; may be {@code null}
	 * @param ids IDs of the comment ranges that are currently open
	 * @param values anchored text collected so far, keyed by comment ID
	 * @param comments output list the collected comments are appended to
	 */
	private void closeCommentRange(String endId, LinkedList<String> ids, Map<String, String> values, List<WordComment> comments) {
		int size = ids.size();
		for (int i = 0; i < size; i++) {
			String tempId = ids.get(i);
			if (!tempId.equals(endId)) {
				continue;
			}
			// A comment opened after another still-open one is a reply to its predecessor.
			String parentId = (i != 0) ? ids.get(i - 1) : "";
			comments.add(new WordComment()
				.setId(tempId)
				.setText(values.get(tempId))
				.setReplyFor(parentId));
			commentIndex++;
		}

		// Have all currently open comments (replies included) been recorded?
		int added = 0;
		for (WordComment c : comments) {
			if (ids.contains(c.getId())) {
				added++;
			}
		}
		if (added == size) {
			ids.clear();
			// Compare against the total count, not the last index, so the
			// traversal does not stop one comment too early.
			if (commentIndex >= totalComments) {
				commentFilledComplete = true;
			}
		}
	}

	/**
	 * Appends a fragment to the text collected for a comment, so that a range
	 * spanning several runs keeps all of its text instead of only the last run.
	 *
	 * @param values anchored text collected so far, keyed by comment ID
	 * @param id ID of the comment the fragment belongs to
	 * @param fragment text to append; ignored when {@code null}
	 */
	private void appendText(Map<String, String> values, String id, String fragment) {
		if (fragment == null) {
			return;
		}
		String existing = values.get(id);
		values.put(id, existing == null ? fragment : existing + fragment);
	}

	/**
	 * Reads an attribute of an XML node.
	 *
	 * @param node node to read from
	 * @param attName qualified attribute name
	 * @return the attribute value, or {@code null} if the node has no such attribute
	 */
	private String getAttribute(Node node, String attName) {
		if (!node.hasAttributes()) {
			return null;
		}
		Node attribute = node.getAttributes().getNamedItem(attName);
		return attribute != null ? attribute.getNodeValue() : null;
	}

	/**
	 * Checks that none of the given references is {@code null}.
	 *
	 * @param objects references to check
	 * @return {@code true} if every reference is non-null, {@code false} otherwise
	 * @date 2019-08-27 13:45
	 * @author weiheng
	 */
	private boolean insureNotNull(Object... objects) {
		for (Object object : objects) {
			if (object == null) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Manual test: prints ID, anchored text, comment content and reply target
	 * of every comment of the given remote file.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		List<WordComment> comments = new WordCommentsParseUtil().getWordCommentsInfo("http://IP:PORT/文件路径");
		String separator = ",";
		for (WordComment c : comments) {
			System.out.println(c.getId() + separator + c.getText() + separator
				+ c.getComment() + separator + c.getReplyFor());
		}
	}
}

代码测试结果如下：

1是针对ID为0的批注的回复，所以这里没有引用的正文内容（2同理，是在1后追加的回复）