docx4j 对比word

最新推荐文章于 2024-06-05 16:48:26 发布
samir张三
最新推荐文章于 2024-06-05 16:48:26 发布
阅读量1.3k
点赞数 2
分类专栏： java 文章标签： Jacob java word
本文链接：https://blog.csdn.net/u013786328/article/details/81077551
版权
本文介绍如何在Java中使用docx4j库对比两个Word文档，包括表格和段落的数据，并展示了如何显示批注差异。文章提供maven和gradle的依赖引入方式，并给出了具体代码示例。
摘要由CSDN通过智能技术生成
在 Java 中对比两份 Word 文档（包括表格中的数据；表格数据与段落数据需要分别处理，详见后面的代码），并通过添加批注来显示差异。
效果图如下
需引入docx4j jar ，版本可拿最新稳定版。
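maven 引入
<dependency>
    <groupId>org.docx4j</groupId>
    <artifactId>docx4j</artifactId>
    <version>3.3.7</version>
</dependency>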
gradle 引入
compile group: 'org.docx4j', name: 'docx4j', version: '3.3.7'
直接上代码
package com.example.word;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class Diff_match_patch {

  // Defaults.
  // Set these on your diff_match_patch instance to override the defaults.
  // All of them are plain mutable instance fields, so each instance carries
  // its own configuration.

  /**
   * Number of seconds to map a diff before giving up (0 for infinity).
   * diff_map() turns this into a millisecond deadline and only enforces the
   * deadline when the value is greater than zero.
   */
  public float Diff_Timeout = 1.0f;
  /**
   * Cost of an empty edit operation in terms of edit characters.
   */
  public short Diff_EditCost = 4;
  /**
   * The size beyond which the double-ended diff activates.
   * Double-ending is twice as fast, but less accurate.
   * diff_map() enables the double-ended search when twice this value is
   * smaller than (text1.length() + text2.length() - 1).
   */
  public short Diff_DualThreshold = 32;
  /**
   * At what point is no match declared (0.0 = perfection, 1.0 = very loose).
   */
  public float Match_Threshold = 0.5f;
  /**
   * How far to search for a match (0 = exact location, 1000+ = broad match).
   * A match this many characters away from the expected location will add
   * 1.0 to the score (0.0 is a perfect match).
   */
  public int Match_Distance = 1000;
  /**
   * When deleting a large block of text (over ~64 characters), how close does
   * the contents have to match the expected contents. (0.0 = perfection,
   * 1.0 = very loose).  Note that Match_Threshold controls how closely the
   * end points of a delete need to match.
   */
  public float Patch_DeleteThreshold = 0.5f;
  /**
   * Chunk size for context length.
   */
  public short Patch_Margin = 4;
  /**
   * The number of bits in an int.
   * NOTE(review): declared as a mutable private field rather than a
   * constant; its readers are outside the visible part of the file.
   */
  private int Match_MaxBits = 32;

  /**
   * Value holder for the three results produced by diff_linesToChars():
   * both encoded texts plus the table needed to decode them again.
   */
  protected static class LinesToCharsResult {
    /** First text, encoded with one char per line. */
    protected String chars1;
    /** Second text, encoded with one char per line. */
    protected String chars2;
    /** Lookup table: the char value used in the encodings indexes this list. */
    protected List<String> lineArray;

    /**
     * Stores the given references as-is; nothing is copied.
     * @param chars1 Encoded first text.
     * @param chars2 Encoded second text.
     * @param lineArray List of unique lines shared by both encodings.
     */
    protected LinesToCharsResult(String chars1, String chars2,
        List<String> lineArray) {
      this.lineArray = lineArray;
      this.chars2 = chars2;
      this.chars1 = chars1;
    }
  }
  //  DIFF FUNCTIONS
  /**
   * Kind of edit carried by a single Diff entry.  A complete diff is a linked
   * list of such entries, for example
   * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"),
   *  Diff(Operation.EQUAL, " world.")}
   * reads as: remove "Hello", insert "Goodbye", leave " world." untouched.
   */
  public enum Operation {
    /** The text exists only in the old string. */
    DELETE,
    /** The text exists only in the new string. */
    INSERT,
    /** The text is common to both strings. */
    EQUAL
  }
  /**
   * Convenience overload of diff_main(String, String, boolean) that always
   * enables the line-level pre-pass (checklines = true), trading a little
   * optimality for speed.
   * @param text1 Old string to be diffed.
   * @param text2 New string to be diffed.
   * @return Linked List of Diff objects.
   */
  public LinkedList<Diff> diff_main(String text1, String text2) {
    final boolean checklines = true;
    return diff_main(text1, text2, checklines);
  }
  /**
   * Diff two texts.  Identical inputs short-circuit to a single EQUAL entry;
   * otherwise the shared prefix and suffix are peeled off, the remaining
   * middle sections are handed to diff_compute(), and the peeled parts are
   * re-attached as EQUAL entries before the list is merged.
   * @param text1 Old string to be diffed.
   * @param text2 New string to be diffed.
   * @param checklines Speedup flag.  If false, then don't run a
   *     line-level diff first to identify the changed areas.
   *     If true, then run a faster slightly less optimal diff
   * @return Linked List of Diff objects.
   */
  public LinkedList<Diff> diff_main(String text1, String text2, boolean checklines) {
    // Fast path: nothing changed at all.
    if (text1.equals(text2)) {
      LinkedList<Diff> identical = new LinkedList<Diff>();
      identical.add(new Diff(Operation.EQUAL, text1));
      return identical;
    }

    // Peel off whatever both texts start with.
    int prefixLength = diff_commonPrefix(text1, text2);
    String head = text1.substring(0, prefixLength);
    String middle1 = text1.substring(prefixLength);
    String middle2 = text2.substring(prefixLength);

    // Peel off whatever the remainders end with.
    int suffixLength = diff_commonSuffix(middle1, middle2);
    int cut1 = middle1.length() - suffixLength;
    int cut2 = middle2.length() - suffixLength;
    String tail = middle1.substring(cut1);
    middle1 = middle1.substring(0, cut1);
    middle2 = middle2.substring(0, cut2);

    // Only the differing middle needs the expensive algorithm.
    LinkedList<Diff> result = diff_compute(middle1, middle2, checklines);

    // Put the shared ends back around the computed diff.
    if (head.length() > 0) {
      result.addFirst(new Diff(Operation.EQUAL, head));
    }
    if (tail.length() > 0) {
      result.addLast(new Diff(Operation.EQUAL, tail));
    }
    diff_cleanupMerge(result);
    return result;
  }
  /**
   * Find the differences between two texts.  Assumes that the texts do not
   * have any common prefix or suffix.
   *
   * Strategy, in order: trivial pure-insert / pure-delete cases, the case
   * where the shorter text is a substring of the longer one, a split around
   * a "half match" (each half is diffed recursively through diff_main()),
   * and finally the full diff_map() search, optionally preceded by a
   * line-level pass.
   * @param text1 Old string to be diffed.
   * @param text2 New string to be diffed.
   * @param checklines Speedup flag.  If false, then don't run a
   *     line-level diff first to identify the changed areas.
   *     If true, then run a faster slightly less optimal diff
   * @return Linked List of Diff objects.
   */
  protected LinkedList<Diff> diff_compute(String text1, String text2,boolean checklines) {
    LinkedList<Diff> diffs = new LinkedList<Diff>();
    if (text1.length() == 0) {
      // Just add some text (speedup)
      diffs.add(new Diff(Operation.INSERT, text2));
      return diffs;
    }
    if (text2.length() == 0) {
      // Just delete some text (speedup)
      diffs.add(new Diff(Operation.DELETE, text1));
      return diffs;
    }
    // When both lengths are equal, text2 is treated as the "long" text.
    String longtext = text1.length() > text2.length() ? text1 : text2;
    String shorttext = text1.length() > text2.length() ? text2 : text1;
    int i = longtext.indexOf(shorttext);
    if (i != -1) {
      // Shorter text is inside the longer text (speedup)
      // The surplus on both sides is deleted if text1 was the longer text,
      // inserted otherwise.
      Operation op = (text1.length() > text2.length()) ?
                     Operation.DELETE : Operation.INSERT;
      diffs.add(new Diff(op, longtext.substring(0, i)));
      diffs.add(new Diff(Operation.EQUAL, shorttext));
      diffs.add(new Diff(op, longtext.substring(i + shorttext.length())));
      return diffs;
    }
    longtext = shorttext = null;  // Garbage collect.
    // Check to see if the problem can be split in two.
    String[] hm = diff_halfMatch(text1, text2);
    if (hm != null) {
      // A half-match was found, sort out the return data.
      // Layout used here: [text1 before, text1 after, text2 before,
      // text2 after, common middle].
      String text1_a = hm[0];
      String text1_b = hm[1];
      String text2_a = hm[2];
      String text2_b = hm[3];
      String mid_common = hm[4];
      // Send both pairs off for separate processing.
      LinkedList<Diff> diffs_a = diff_main(text1_a, text2_a, checklines);
      LinkedList<Diff> diffs_b = diff_main(text1_b, text2_b, checklines);
      // Merge the results.
      diffs = diffs_a;
      diffs.add(new Diff(Operation.EQUAL, mid_common));
      diffs.addAll(diffs_b);
      return diffs;
    }
    // Perform a real diff.
    if (checklines && (text1.length() < 100 || text2.length() < 100)) {
      checklines = false;  // Too trivial for the overhead.
    }
    List<String> linearray = null;
    if (checklines) {
      // Scan the text on a line-by-line basis first.
      // From here on text1/text2 hold the encoded form (one char per line).
      LinesToCharsResult b = diff_linesToChars(text1, text2);
      text1 = b.chars1;
      text2 = b.chars2;
      linearray = b.lineArray;
    }
    diffs = diff_map(text1, text2);
    if (diffs == null) {
      // No acceptable result.
      // diff_map() gave up (timeout or no path found): fall back to
      // replacing the whole of text1 with the whole of text2.
      diffs = new LinkedList<Diff>();
      diffs.add(new Diff(Operation.DELETE, text1));
      diffs.add(new Diff(Operation.INSERT, text2));
    }
    if (checklines) {
      // Convert the diff back to original text.
      diff_charsToLines(diffs, linearray);
      // Eliminate freak matches (e.g. blank lines)
      diff_cleanupSemantic(diffs);
      // Rediff any replacement blocks, this time character-by-character.
      // Add a dummy entry at the end.
      // The dummy EQUAL guarantees the EQUAL branch below also runs for a
      // trailing run of deletes/inserts.
      diffs.add(new Diff(Operation.EQUAL, ""));
      int count_delete = 0;
      int count_insert = 0;
      String text_delete = "";
      String text_insert = "";
      ListIterator<Diff> pointer = diffs.listIterator();
      Diff thisDiff = pointer.next();
      while (thisDiff != null) {
        switch (thisDiff.operation) {
        case INSERT:
          // Accumulate the run of inserted text since the last equality.
          count_insert++;
          text_insert += thisDiff.text;
          break;
        case DELETE:
          // Accumulate the run of deleted text since the last equality.
          count_delete++;
          text_delete += thisDiff.text;
          break;
        case EQUAL:
          // Upon reaching an equality, check for prior redundancies.
          if (count_delete >= 1 && count_insert >= 1) {
            // Delete the offending records and add the merged ones.
            // Step back over the equality itself first, then remove the
            // preceding delete/insert entries one by one.
            pointer.previous();
            for (int j = 0; j < count_delete + count_insert; j++) {
              pointer.previous();
              pointer.remove();
            }
            // Character-level diff of the accumulated block; checklines is
            // false so this does not run another line-level pass.
            for (Diff newDiff : diff_main(text_delete, text_insert, false)) {
              pointer.add(newDiff);
            }
          }
          count_insert = 0;
          count_delete = 0;
          text_delete = "";
          text_insert = "";
          break;
        }
        thisDiff = pointer.hasNext() ? pointer.next() : null;
      }
      diffs.removeLast();  // Remove the dummy entry at the end.
    }
    return diffs;
  }
  /**
   * Encode two texts line by line: every distinct line is assigned an index
   * in a shared table and each text becomes a string holding one char per
   * line, the char value being that line's index.
   * @param text1 First string.
   * @param text2 Second string.
   * @return An object containing the encoded text1, the encoded text2 and
   *     the List of unique strings.  The zeroth element of the List of
   *     unique strings is intentionally blank.
   */
  protected LinesToCharsResult diff_linesToChars(String text1, String text2) {
    // indexByLine.get("Hello\n") == 4  <=>  uniqueLines.get(4) == "Hello\n"
    Map<String, Integer> indexByLine = new HashMap<String, Integer>();
    List<String> uniqueLines = new ArrayList<String>();

    // Occupy slot 0 with a placeholder so that no line is ever encoded as
    // "\x00": it is a valid character, but various debuggers don't like it.
    uniqueLines.add("");

    // text1 must be encoded before text2: both calls extend the same table.
    String encoded1 = diff_linesToCharsMunge(text1, uniqueLines, indexByLine);
    String encoded2 = diff_linesToCharsMunge(text2, uniqueLines, indexByLine);
    return new LinesToCharsResult(encoded1, encoded2, uniqueLines);
  }
  /**
   * Split a text into a list of strings.  Reduce the texts to a string of
   * hashes where each Unicode character represents one line.
   *
   * A line includes its terminating '\n'; a final line without a newline is
   * kept as-is.  Lines not seen before are appended to lineArray and
   * registered in lineHash, so both collections are modified by this call.
   * An empty text yields an empty string and leaves both collections alone.
   *
   * Limitation: the index is narrowed to a char, so once lineArray holds
   * more than 65535 entries the emitted characters wrap around and no longer
   * identify their line.
   * @param text String to encode.
   * @param lineArray List of unique strings.
   * @param lineHash Map of strings to indices.
   * @return Encoded string.
   */
  private String diff_linesToCharsMunge(String text, List<String> lineArray,
                                        Map<String, Integer> lineHash) {
    StringBuilder chars = new StringBuilder();
    int lineStart = 0;
    int lineEnd = -1;
    // Walk the text, pulling out a substring for each line.
    // text.split('\n') would temporarily double our memory footprint.
    // Modifying text would create many large strings to garbage collect.
    while (lineEnd < text.length() - 1) {
      lineEnd = text.indexOf('\n', lineStart);
      if (lineEnd == -1) {
        // No further newline: the rest of the text is the last line.
        lineEnd = text.length() - 1;
      }
      String line = text.substring(lineStart, lineEnd + 1);
      lineStart = lineEnd + 1;

      // One map lookup decides both "is it known?" and "what is its index?".
      Integer index = lineHash.get(line);
      if (index == null) {
        lineArray.add(line);
        index = lineArray.size() - 1;
        lineHash.put(line, index);
      }
      // Append the char directly; no temporary String per line.
      chars.append((char) index.intValue());
    }
    return chars.toString();
  }
  /**
   * Undo the line encoding in place: every character of each Diff's text is
   * used as an index into lineArray and replaced by the line stored there.
   * @param diffs LinkedList of Diff objects.
   * @param lineArray List of unique strings.
   */
  protected void diff_charsToLines(LinkedList<Diff> diffs,
                                  List<String> lineArray) {
    for (Diff entry : diffs) {
      StringBuilder expanded = new StringBuilder();
      int encodedLength = entry.text.length();
      for (int pos = 0; pos < encodedLength; pos++) {
        // The char value itself is the position of the line in the table.
        int lineIndex = entry.text.charAt(pos);
        expanded.append(lineArray.get(lineIndex));
      }
      entry.text = expanded.toString();
    }
  }
  /**
   * Explore the intersection points between the two texts.
   *
   * Advances a forward search from the start of both texts one step (d) at a
   * time and, when double-ended mode is enabled, a second search from the
   * end of both texts.  The visited points of every step are recorded so
   * that diff_path1() / diff_path2() can reconstruct the edit script once
   * the end is reached or the two searches meet.
   * @param text1 Old string to be diffed.
   * @param text2 New string to be diffed.
   * @return LinkedList of Diff objects or null if no diff available
   *     (the timeout expired or the loop finished without finding a path).
   */
  protected LinkedList<Diff> diff_map(String text1, String text2) {
    // Absolute deadline in milliseconds; only checked when Diff_Timeout > 0.
    long ms_end = System.currentTimeMillis() + (long) (Diff_Timeout * 1000);
    // Cache the text lengths to prevent multiple calls.
    int text1_length = text1.length();
    int text2_length = text2.length();
    int max_d = text1_length + text2_length - 1;
    // Small inputs are searched from the front only.
    boolean doubleEnd = Diff_DualThreshold * 2 < max_d;
    // v_map1 / v_map2: for each step d, the set of footprints reached by the
    // front / reverse search during that step.
    List<Set<Long>> v_map1 = new ArrayList<Set<Long>>();
    List<Set<Long>> v_map2 = new ArrayList<Set<Long>>();
    // v1 / v2: furthest x reached so far on each diagonal k, for the front /
    // reverse search.  Seeded so that the first step (d = 0, k = 0) starts
    // at x = 0.
    Map<Integer, Integer> v1 = new HashMap<Integer, Integer>();
    Map<Integer, Integer> v2 = new HashMap<Integer, Integer>();
    v1.put(1, 0);
    v2.put(1, 0);
    int x, y;
    // NOTE(review): diff_footprint() is defined outside this view; it is
    // presumably a unique long key for the point (x, y) -- confirm.
    Long footstep = 0L;  // Used to track overlapping paths.
    // Footprint -> step d at which one search recorded it; the other search
    // looks its own footprints up here to detect a collision.
    Map<Long, Integer> footsteps = new HashMap<Long, Integer>();
    boolean done = false;
    // If the total number of characters is odd, then the front path will
    // collide with the reverse path.
    // front == true: the reverse search records footsteps and the front
    // search checks them; front == false: the roles are swapped.
    boolean front = ((text1_length + text2_length) % 2 == 1);
    for (int d = 0; d < max_d; d++) {
      // Bail out if timeout reached.
      if (Diff_Timeout > 0 && System.currentTimeMillis() > ms_end) {
        return null;
      }

      // Walk the front path one step.
      v_map1.add(new HashSet<Long>());  // Adds at index 'd'.
      for (int k = -d; k <= d; k += 2) {
        // Choose the neighbouring diagonal to extend from: take x from
        // k + 1 unchanged, or x from k - 1 advanced by one.
        if (k == -d || k != d && v1.get(k - 1) < v1.get(k + 1)) {
          x = v1.get(k + 1);
        } else {
          x = v1.get(k - 1) + 1;
        }
        y = x - k;
        if (doubleEnd) {
          footstep = diff_footprint(x, y);
          if (front && (footsteps.containsKey(footstep))) {
            done = true;
          }
          if (!front) {
            footsteps.put(footstep, d);
          }
        }
        // Follow the run of matching characters along this diagonal,
        // checking/recording a footprint at every point in double-ended mode.
        while (!done && x < text1_length && y < text2_length
               && text1.charAt(x) == text2.charAt(y)) {
          x++;
          y++;
          if (doubleEnd) {
            footstep = diff_footprint(x, y);
            if (front && (footsteps.containsKey(footstep))) {
              done = true;
            }
            if (!front) {
              footsteps.put(footstep, d);
            }
          }
        }
        v1.put(k, x);
        v_map1.get(d).add(diff_footprint(x, y));
        if (x == text1_length && y == text2_length) {
          // Reached the end in single-path mode.
          return diff_path1(v_map1, text1, text2);
        } else if (done) {
          // Front path ran over reverse path.
          // Keep only the reverse steps up to the one that recorded the
          // colliding footprint, then build each half from its own side of
          // the meeting point (x, y).
          v_map2 = v_map2.subList(0, footsteps.get(footstep) + 1);
          LinkedList<Diff> a = diff_path1(v_map1, text1.substring(0, x),
                                          text2.substring(0, y));
          a.addAll(diff_path2(v_map2, text1.substring(x), text2.substring(y)));
          return a;
        }
      }
      if (doubleEnd) {
        // Walk the reverse path one step.
        // Here x and y count characters from the END of the texts; they are
        // converted to front-based coordinates for footprints and charAt().
        v_map2.add(new HashSet<Long>());  // Adds at index 'd'.
        for (int k = -d; k <= d; k += 2) {
          if (k == -d || k != d && v2.get(k - 1) < v2.get(k + 1)) {
            x = v2.get(k + 1);
          } else {
            x = v2.get(k - 1) + 1;
          }
          y = x - k;
          footstep = diff_footprint(text1_length - x, text2_length - y);
          if (!front && (footsteps.containsKey(footstep))) {
            done = true;
          }
          if (front) {
            footsteps.put(footstep, d);
          }
          while (!done && x < text1_length && y < text2_length
                 && text1.charAt(text1_length - x - 1)
                 == text2.charAt(text2_length - y - 1)) {
            x++;
            y++;
            footstep = diff_footprint(text1_length - x, text2_length - y);
            if (!front && (footsteps.containsKey(footstep))) {
              done = true;
            }
            if (front) {
              footsteps.put(footstep, d);
            }
          }
          v2.put(k, x);
          // Stored in end-based coordinates, unlike the footsteps map.
          v_map2.get(d).add(diff_footprint(x, y));
          if (done) {
            // Reverse path ran over front path.
            // Mirror image of the front-collision case above: trim the
            // front steps and split both texts at the meeting point.
            v_map1 = v_map1.subList(0, footsteps.get(footstep) + 1);
            LinkedList<Diff> a
                = diff_path1(v_map1, text1.substring(0, text1_length - x),
                             text2.substring(0, text2_length - y));
            a.addAll(diff_path2(v_map2, text1.substring(text1_length - x),
                                text2.substring(text2_length - y)));
            return a;
          }
        }
      }
    }
    // Number of diffs equals number of characters, no commonality at all.
    return null;
  }
  /**
   * Work from the middle back to the start to determine the path.
   *
   * Starts at (text1.length(), text2.length()) and walks backwards through
   * the recorded steps, prepending to the result so that it ends up in
   * forward order.  Consecutive steps of the same kind are merged into one
   * Diff by prepending the character to the first entry's text.
   * @param v_map List of path sets.
   * @param text1 Old string fragment to be diffed.
   * @param text2 New string fragment to be diffed.
   * @return LinkedList of Diff objects.
   */
  protected LinkedList<Diff> diff_path1(List<Set<Long>> v_map,
                                        String text1, String text2) {
    LinkedList<Diff> path = new LinkedList<Diff>();
    int x = text1.length();
    int y = text2.length();
    Operation last_op = null;
    // The last set in v_map is the step that reached (x, y); start from the
    // step before it.
    for (int d = v_map.size() - 2; d >= 0; d--) {
      // Stay on step d while moving diagonally; a DELETE or INSERT move
      // breaks out and continues with the previous step.
      while (true) {
        if (v_map.get(d).contains(diff_footprint(x - 1, y))) {
          // Step d reached (x - 1, y): one character of text1 was deleted.
          x--;
          if (last_op == Operation.DELETE) {
            path.getFirst().text = text1.charAt(x) + path.getFirst().text;
          } else {
            path.addFirst(new Diff(Operation.DELETE,
                                   text1.substring(x, x + 1)));
          }
          last_op = Operation.DELETE;
          break;
        } else if (v_map.get(d).contains(diff_footprint(x, y - 1))) {
          // Step d reached (x, y - 1): one character of text2 was inserted.
          y--;
          if (last_op == Operation.INSERT) {
            path.getFirst().text = text2.charAt(y) + path.getFirst().text;
          } else {
            path.addFirst(new Diff(Operation.INSERT,
                                   text2.substring(y, y + 1)));
          }
          last_op = Operation.INSERT;
          break;
        } else {
          // Neither neighbour was visited in step d, so this point was
          // reached along a diagonal: the characters are equal.
          x--;
          y--;
          assert (text1.charAt(x) == text2.charAt(y))
                 : "No diagonal.  Can't happen. (diff_path1)";
          if (last_op == Operation.EQUAL) {
            path.getFirst().text = text1.charAt(x) + path.getFirst().text;
          } else {
            path.addFirst(new Diff(Operation.EQUAL, text1.substring(x, x + 1)));
          }
          last_op = Operation.EQUAL;
        }
      }
    }
    return path;
  }
  /**
   * Work from the middle back to the end to determine the path.
   * @param v_map List of path sets.
   * @param text1 Old string fragment to be diffed.
   * @param text2 New string fragment to be diffed.
   * @return LinkedList of Diff objects.
   */
  protected LinkedList<Diff> diff_path2(List<Set<Long>> v_map,
                                        String text1, String text2) {
    LinkedList<Diff> path = new LinkedList<Diff>();
    int x = text1.length();
    int y = text2.length();
    Operation last_op = null;
    for (int d = v_map.size() - 2; d >= 0; d--) {
      while (true) {
        if (v_map.get(d).contains(diff_footprint(x - 1, y))) {
          x--;
          if (last_op == Operation.DELETE) {
            path.getLast().text += text1.charAt(text1.length() - x - 1);
          } else {
            path.addLast(new Diff(Operation.DELETE,
                text1.substring(text1.length() - x - 1, text1.length() - x)));
          }
          last_op = Operation.DELETE;
          break;
        } else if (v_map.get(d).contains(diff_footprint(x, y - 1))) {
          y--;
          if (last_op == Operation.INSERT) {
            path.getLast().text += text2.charAt(text2.length() - y - 1);
          } else {
            path.addLast(new Diff(Operation.INSERT,
                text2.substring(text2.length() - y - 1, text2.length() - y)));
          }
          last_op = Operation.INSERT;
          break;
        } else {
          x--;
          y--;
          assert (text1.
最低0.47元/天解锁文章
samir张三
关注
2
点赞
踩
4

收藏

觉得还不错? 一键收藏
2
评论
docx4j 对比word

在java对比两份word文档（包括表格里面的数据，表格里的数据和段落里的数据需要分别处理，详情看后面代码），增加批注显示差异。效果图如下需引入docx4j jar ，版本可拿最新稳定版。maven 引入 <dependency> <groupId>org.docx4j</groupId> <artifactId...
复制链接

扫一扫
专栏目录