在 Java 中对比两份 Word 文档(包括表格中的数据;表格数据和段落数据需要分别处理,详见后面的代码),并通过添加批注的方式显示两者的差异。
效果图如下
需要引入 docx4j 的 jar 包,版本可使用最新的稳定版。
maven 引入
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j</artifactId>
<version>3.3.7</version>
</dependency>
gradle 引入
compile group: 'org.docx4j', name: 'docx4j', version: '3.3.7'
直接上代码
package com.example.word;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Diff_match_patch {
// Defaults.
// Set these on your diff_match_patch instance to override the defaults.
// All of the public settings below are mutable instance fields, so each
// instance can be tuned independently.
/**
* Number of seconds to map a diff before giving up (0 for infinity).
* diff_map() turns this into a millisecond deadline and returns null once the
* deadline has passed; a value that is not greater than zero disables the
* check.
*/
public float Diff_Timeout = 1.0f;
/**
* Cost of an empty edit operation in terms of edit characters.
*/
public short Diff_EditCost = 4;
/**
* The size beyond which the double-ended diff activates.
* Double-ending is twice as fast, but less accurate.
* diff_map() enables the double-ended walk when twice this value is smaller
* than (text1.length() + text2.length() - 1).
*/
public short Diff_DualThreshold = 32;
/**
* At what point is no match declared (0.0 = perfection, 1.0 = very loose).
*/
public float Match_Threshold = 0.5f;
/**
* How far to search for a match (0 = exact location, 1000+ = broad match).
* A match this many characters away from the expected location will add
* 1.0 to the score (0.0 is a perfect match).
*/
public int Match_Distance = 1000;
/**
* When deleting a large block of text (over ~64 characters), how close does
* the contents have to match the expected contents. (0.0 = perfection,
* 1.0 = very loose). Note that Match_Threshold controls how closely the
* end points of a delete need to match.
*/
public float Patch_DeleteThreshold = 0.5f;
/**
* Chunk size for context length.
*/
public short Patch_Margin = 4;
/**
* The number of bits in an int.
* NOTE(review): private and initialised to 32; whether it is ever reassigned
* cannot be seen from this part of the file -- confirm before making it a
* constant.
*/
private int Match_MaxBits = 32;
/**
 * Simple value holder for the three results of diff_linesToChars():
 * the encoded form of each input text and the table of unique lines that
 * the encoded characters refer to.
 * (Languages with tuples would simply return a three-element array.)
 */
protected static class LinesToCharsResult {
    /** Encoded form of the first text, as produced by diff_linesToChars(). */
    protected String chars1;
    /** Encoded form of the second text, as produced by diff_linesToChars(). */
    protected String chars2;
    /** Table of unique lines shared by both encoded texts. */
    protected List<String> lineArray;

    /**
     * Bundles the three values together; the arguments are stored as given,
     * without copying.
     * @param chars1 Encoded first text.
     * @param chars2 Encoded second text.
     * @param lineArray List of unique strings.
     */
    protected LinesToCharsResult(String chars1, String chars2,
            List<String> lineArray) {
        this.lineArray = lineArray;
        this.chars2 = chars2;
        this.chars1 = chars1;
    }
}
// DIFF FUNCTIONS
/**
 * The kind of edit carried by a single Diff.
 * A complete diff is a linked list of Diff objects, for example
 * {Diff(Operation.DELETE, "Hello"), Diff(Operation.INSERT, "Goodbye"),
 * Diff(Operation.EQUAL, " world.")}
 * which reads as: delete "Hello", add "Goodbye" and keep " world."
 */
public enum Operation {
    /** The text is removed from the old string. */
    DELETE,
    /** The text is added in the new string. */
    INSERT,
    /** The text is kept unchanged. */
    EQUAL
}
/**
 * Find the differences between two texts with the line-level speedup
 * switched on.
 * This overload makes the 'checklines' argument of diff_main() optional:
 * most of the time checklines is wanted, so it defaults to true.
 * @param text1 Old string to be diffed.
 * @param text2 New string to be diffed.
 * @return Linked List of Diff objects.
 */
public LinkedList<Diff> diff_main(String text1, String text2) {
    final boolean checklines = true;
    return diff_main(text1, text2, checklines);
}
/**
 * Find the differences between two texts. Simplifies the problem by
 * stripping any common prefix or suffix off the texts before diffing.
 * @param text1 Old string to be diffed. Must not be null.
 * @param text2 New string to be diffed. Must not be null.
 * @param checklines Speedup flag. If false, then don't run a
 *     line-level diff first to identify the changed areas.
 *     If true, then run a faster slightly less optimal diff
 * @return Linked List of Diff objects.
 * @throws NullPointerException if either text is null.
 */
public LinkedList<Diff> diff_main(String text1, String text2, boolean checklines) {
    // Fail fast with a descriptive message. Previously a null text1 produced a
    // message-less NullPointerException from text1.equals(), and a null text2
    // was handed on to the prefix/suffix helpers. The exception type is
    // deliberately kept as NullPointerException so existing callers see the
    // same failure class.
    if (text1 == null || text2 == null) {
        throw new NullPointerException("Null inputs. (diff_main)");
    }

    // Check for equality (speedup)
    LinkedList<Diff> diffs;
    if (text1.equals(text2)) {
        diffs = new LinkedList<Diff>();
        diffs.add(new Diff(Operation.EQUAL, text1));
        return diffs;
    }

    // Trim off common prefix (speedup)
    int commonlength = diff_commonPrefix(text1, text2);
    String commonprefix = text1.substring(0, commonlength);
    text1 = text1.substring(commonlength);
    text2 = text2.substring(commonlength);

    // Trim off common suffix (speedup). This is measured on the already
    // prefix-trimmed texts, so prefix and suffix cannot overlap.
    commonlength = diff_commonSuffix(text1, text2);
    String commonsuffix = text1.substring(text1.length() - commonlength);
    text1 = text1.substring(0, text1.length() - commonlength);
    text2 = text2.substring(0, text2.length() - commonlength);

    // Compute the diff on the middle block
    diffs = diff_compute(text1, text2, checklines);

    // Restore the prefix and suffix
    if (commonprefix.length() != 0) {
        diffs.addFirst(new Diff(Operation.EQUAL, commonprefix));
    }
    if (commonsuffix.length() != 0) {
        diffs.addLast(new Diff(Operation.EQUAL, commonsuffix));
    }

    diff_cleanupMerge(diffs);
    return diffs;
}
/**
 * Find the differences between two texts. Assumes that the texts do not
 * have any common prefix or suffix.
 *
 * The strategies are tried in this order: one side empty, one text
 * contained in the other, a half-match split handled by two recursive
 * diff_main() calls, and finally diff_map() (optionally on a line-encoded
 * form of the texts, followed by a character-level re-diff of every
 * replaced region).
 * @param text1 Old string to be diffed.
 * @param text2 New string to be diffed.
 * @param checklines Speedup flag. If false, then don't run a
 *     line-level diff first to identify the changed areas.
 *     If true, then run a faster slightly less optimal diff
 * @return Linked List of Diff objects.
 */
protected LinkedList<Diff> diff_compute(String text1, String text2,boolean checklines) {
    LinkedList<Diff> diffs = new LinkedList<Diff>();

    // One side empty: the whole other side is a single insertion/deletion.
    if (text1.length() == 0) {
        diffs.add(new Diff(Operation.INSERT, text2));
        return diffs;
    }
    if (text2.length() == 0) {
        diffs.add(new Diff(Operation.DELETE, text1));
        return diffs;
    }

    // Shorter text wholly contained in the longer one (speedup).
    final boolean firstIsLonger = text1.length() > text2.length();
    final String longer = firstIsLonger ? text1 : text2;
    final String shorter = firstIsLonger ? text2 : text1;
    final int foundAt = longer.indexOf(shorter);
    if (foundAt != -1) {
        final Operation surplus = firstIsLonger ? Operation.DELETE : Operation.INSERT;
        diffs.add(new Diff(surplus, longer.substring(0, foundAt)));
        diffs.add(new Diff(Operation.EQUAL, shorter));
        diffs.add(new Diff(surplus, longer.substring(foundAt + shorter.length())));
        return diffs;
    }

    // Check to see if the problem can be split in two.
    final String[] halves = diff_halfMatch(text1, text2);
    if (halves != null) {
        // Layout: {text1 head, text1 tail, text2 head, text2 tail, common middle}.
        LinkedList<Diff> headDiffs = diff_main(halves[0], halves[2], checklines);
        LinkedList<Diff> tailDiffs = diff_main(halves[1], halves[3], checklines);
        headDiffs.add(new Diff(Operation.EQUAL, halves[4]));
        headDiffs.addAll(tailDiffs);
        return headDiffs;
    }

    // Perform a real diff. Line mode is too much overhead when either text is
    // shorter than 100 characters.
    final boolean lineMode = checklines && text1.length() >= 100 && text2.length() >= 100;
    List<String> uniqueLines = null;
    if (lineMode) {
        // Scan the text on a line-by-line basis first.
        LinesToCharsResult encoded = diff_linesToChars(text1, text2);
        text1 = encoded.chars1;
        text2 = encoded.chars2;
        uniqueLines = encoded.lineArray;
    }

    diffs = diff_map(text1, text2);
    if (diffs == null) {
        // No acceptable result: fall back to "replace everything".
        diffs = new LinkedList<Diff>();
        diffs.add(new Diff(Operation.DELETE, text1));
        diffs.add(new Diff(Operation.INSERT, text2));
    }
    if (!lineMode) {
        return diffs;
    }

    // Convert the diff back to original text.
    diff_charsToLines(diffs, uniqueLines);
    // Eliminate freak matches (e.g. blank lines)
    diff_cleanupSemantic(diffs);

    // Rediff any replacement blocks, this time character-by-character.
    // A dummy trailing equality guarantees the final run of edits is flushed.
    diffs.add(new Diff(Operation.EQUAL, ""));
    int deletions = 0;
    int insertions = 0;
    StringBuilder deletedText = new StringBuilder();
    StringBuilder insertedText = new StringBuilder();
    ListIterator<Diff> cursor = diffs.listIterator();
    Diff current = cursor.next();
    while (current != null) {
        switch (current.operation) {
        case INSERT:
            insertions++;
            insertedText.append(current.text);
            break;
        case DELETE:
            deletions++;
            deletedText.append(current.text);
            break;
        case EQUAL:
            // Upon reaching an equality, check for prior redundancies.
            if (deletions >= 1 && insertions >= 1) {
                // Step back over the equality, drop the accumulated edits and
                // splice in their character-level diff instead.
                cursor.previous();
                final int stale = deletions + insertions;
                for (int dropped = 0; dropped < stale; dropped++) {
                    cursor.previous();
                    cursor.remove();
                }
                for (Diff refined : diff_main(deletedText.toString(),
                        insertedText.toString(), false)) {
                    cursor.add(refined);
                }
            }
            insertions = 0;
            deletions = 0;
            deletedText.setLength(0);
            insertedText.setLength(0);
            break;
        }
        current = cursor.hasNext() ? cursor.next() : null;
    }
    diffs.removeLast(); // Remove the dummy entry at the end.
    return diffs;
}
/**
 * Split two texts into a list of strings. Reduce the texts to a string of
 * hashes where each Unicode character represents one line.
 * Both texts are encoded against the same line table, so identical lines in
 * either text map to the same character.
 * @param text1 First string.
 * @param text2 Second string.
 * @return An object containing the encoded text1, the encoded text2 and
 *     the List of unique strings. The zeroth element of the List of
 *     unique strings is intentionally blank.
 */
protected LinesToCharsResult diff_linesToChars(String text1, String text2) {
    // indexByLine.get("Hello\n") == 4  <=>  uniqueLines.get(4) == "Hello\n"
    Map<String, Integer> indexByLine = new HashMap<String, Integer>();
    List<String> uniqueLines = new ArrayList<String>();

    // "\x00" is a valid character, but various debuggers don't like it.
    // Occupying slot 0 with a junk entry means no real line is ever encoded
    // as the null character.
    uniqueLines.add("");

    // text1 must be encoded before text2: indices are handed out in order of
    // first appearance.
    String encoded1 = diff_linesToCharsMunge(text1, uniqueLines, indexByLine);
    String encoded2 = diff_linesToCharsMunge(text2, uniqueLines, indexByLine);
    return new LinesToCharsResult(encoded1, encoded2, uniqueLines);
}
/**
 * Split a text into a list of strings. Reduce the texts to a string of
 * hashes where each Unicode character represents one line.
 * Lines keep their trailing '\n'; a final line without a newline is taken
 * as-is. Lines not seen before are appended to lineArray and registered in
 * lineHash.
 * NOTE(review): the index is narrowed to a char, so indices above 65535
 * would wrap around -- confirm inputs never contain that many unique lines.
 * @param text String to encode.
 * @param lineArray List of unique strings.
 * @param lineHash Map of strings to indices.
 * @return Encoded string.
 */
private String diff_linesToCharsMunge(String text, List<String> lineArray,
        Map<String, Integer> lineHash) {
    final int lastPos = text.length() - 1;
    StringBuilder encoded = new StringBuilder();
    int from = 0;
    int newlineAt = -1;
    // Walk the text, pulling out a substring for each line.
    // text.split('\n') would temporarily double our memory footprint.
    // Modifying text would create many large strings to garbage collect.
    while (newlineAt < lastPos) {
        newlineAt = text.indexOf('\n', from);
        if (newlineAt == -1) {
            // No further newline: the remainder is the last line.
            newlineAt = lastPos;
        }
        final String line = text.substring(from, newlineAt + 1);
        from = newlineAt + 1;

        final int index;
        if (lineHash.containsKey(line)) {
            index = lineHash.get(line);
        } else {
            lineArray.add(line);
            index = lineArray.size() - 1;
            lineHash.put(line, index);
        }
        encoded.append((char) index);
    }
    return encoded.toString();
}
/**
 * Rehydrate the text in a diff from a string of line hashes to real lines of
 * text.
 * Every character of each diff's text is used as an index into lineArray,
 * and the diff's text is replaced in place by the concatenation of the
 * lines found there.
 * @param diffs LinkedList of Diff objects.
 * @param lineArray List of unique strings.
 */
protected void diff_charsToLines(LinkedList<Diff> diffs,
        List<String> lineArray) {
    for (Diff diff : diffs) {
        final int encodedLength = diff.text.length();
        StringBuilder restored = new StringBuilder();
        for (int pos = 0; pos < encodedLength; pos++) {
            final int lineIndex = diff.text.charAt(pos);
            restored.append(lineArray.get(lineIndex));
        }
        diff.text = restored.toString();
    }
}
/**
* Explore the intersection points between the two texts.
* A front path is walked from the start of both texts and, in double-ended
* mode, a reverse path is walked from the end of both texts; the walk stops
* when the front path reaches the end of both texts or when the two paths
* run over each other. The visited points of every step are recorded so that
* diff_path1() / diff_path2() can rebuild the edit list.
* @param text1 Old string to be diffed.
* @param text2 New string to be diffed.
* @return LinkedList of Diff objects or null if no diff available (the
*     timeout was reached, or all steps were used up without finishing).
*/
protected LinkedList<Diff> diff_map(String text1, String text2) {
// Absolute deadline in milliseconds; only checked when Diff_Timeout > 0.
long ms_end = System.currentTimeMillis() + (long) (Diff_Timeout * 1000);
// Cache the text lengths to prevent multiple calls.
int text1_length = text1.length();
int text2_length = text2.length();
// Upper bound (exclusive) on the number of steps 'd' that are tried.
int max_d = text1_length + text2_length - 1;
// Only walk from both ends when the problem is large enough.
boolean doubleEnd = Diff_DualThreshold * 2 < max_d;
// v_map1.get(d) / v_map2.get(d): footprints of the points reached at step d
// by the front / reverse path.
List<Set<Long>> v_map1 = new ArrayList<Set<Long>>();
List<Set<Long>> v_map2 = new ArrayList<Set<Long>>();
// v1 / v2: for each diagonal k, the x value reached so far on the front /
// reverse path. Seeded at k = 1 so that step d = 0 can read k + 1.
Map<Integer, Integer> v1 = new HashMap<Integer, Integer>();
Map<Integer, Integer> v2 = new HashMap<Integer, Integer>();
v1.put(1, 0);
v2.put(1, 0);
int x, y;
Long footstep = 0L; // Used to track overlapping paths.
// Footprint -> step number at which the recording path visited that point.
// Which path records and which one checks is decided by 'front' below.
Map<Long, Integer> footsteps = new HashMap<Long, Integer>();
boolean done = false;
// If the total number of characters is odd, then the front path will
// collide with the reverse path.
boolean front = ((text1_length + text2_length) % 2 == 1);
for (int d = 0; d < max_d; d++) {
// Bail out if timeout reached.
if (Diff_Timeout > 0 && System.currentTimeMillis() > ms_end) {
return null;
}
// Walk the front path one step.
v_map1.add(new HashSet<Long>()); // Adds at index 'd'.
for (int k = -d; k <= d; k += 2) {
// Pick the neighbouring diagonal to extend from: k + 1 keeps x, k - 1
// advances x by one.
if (k == -d || k != d && v1.get(k - 1) < v1.get(k + 1)) {
x = v1.get(k + 1);
} else {
x = v1.get(k - 1) + 1;
}
y = x - k;
if (doubleEnd) {
// diff_footprint() is defined elsewhere in this class; presumably it
// encodes the (x, y) pair as a single Long key -- confirm.
footstep = diff_footprint(x, y);
if (front && (footsteps.containsKey(footstep))) {
done = true;
}
if (!front) {
footsteps.put(footstep, d);
}
}
// Follow the diagonal for as long as the characters match.
while (!done && x < text1_length && y < text2_length
&& text1.charAt(x) == text2.charAt(y)) {
x++;
y++;
if (doubleEnd) {
footstep = diff_footprint(x, y);
if (front && (footsteps.containsKey(footstep))) {
done = true;
}
if (!front) {
footsteps.put(footstep, d);
}
}
}
v1.put(k, x);
v_map1.get(d).add(diff_footprint(x, y));
if (x == text1_length && y == text2_length) {
// Reached the end in single-path mode.
return diff_path1(v_map1, text1, text2);
} else if (done) {
// Front path ran over reverse path.
// Keep only the reverse steps up to the one that recorded this point,
// then build the two halves of the diff around (x, y).
v_map2 = v_map2.subList(0, footsteps.get(footstep) + 1);
LinkedList<Diff> a = diff_path1(v_map1, text1.substring(0, x),
text2.substring(0, y));
a.addAll(diff_path2(v_map2, text1.substring(x), text2.substring(y)));
return a;
}
}
if (doubleEnd) {
// Walk the reverse path one step.
// Here x and y count characters from the END of text1 / text2.
v_map2.add(new HashSet<Long>()); // Adds at index 'd'.
for (int k = -d; k <= d; k += 2) {
if (k == -d || k != d && v2.get(k - 1) < v2.get(k + 1)) {
x = v2.get(k + 1);
} else {
x = v2.get(k - 1) + 1;
}
y = x - k;
// Footsteps are keyed by front-relative coordinates so that both paths
// share one map.
footstep = diff_footprint(text1_length - x, text2_length - y);
if (!front && (footsteps.containsKey(footstep))) {
done = true;
}
if (front) {
footsteps.put(footstep, d);
}
// Follow the diagonal backwards for as long as the characters match.
while (!done && x < text1_length && y < text2_length
&& text1.charAt(text1_length - x - 1)
== text2.charAt(text2_length - y - 1)) {
x++;
y++;
footstep = diff_footprint(text1_length - x, text2_length - y);
if (!front && (footsteps.containsKey(footstep))) {
done = true;
}
if (front) {
footsteps.put(footstep, d);
}
}
v2.put(k, x);
// Unlike 'footsteps', v_map2 stores the end-relative (x, y).
v_map2.get(d).add(diff_footprint(x, y));
if (done) {
// Reverse path ran over front path.
v_map1 = v_map1.subList(0, footsteps.get(footstep) + 1);
LinkedList<Diff> a
= diff_path1(v_map1, text1.substring(0, text1_length - x),
text2.substring(0, text2_length - y));
a.addAll(diff_path2(v_map2, text1.substring(text1_length - x),
text2.substring(text2_length - y)));
return a;
}
}
}
}
// Number of diffs equals number of characters, no commonality at all.
return null;
}
/**
 * Work from the middle back to the start to determine the path.
 * Starting at (text1.length(), text2.length()), each recorded step is
 * visited from the second-to-last one down to step 0. For every step the
 * walk moves diagonally (an equality) until a point recorded for that step
 * lies directly to the left (a deletion) or directly above (an insertion).
 * Consecutive operations of the same kind are merged into one Diff.
 * @param v_map List of path sets.
 * @param text1 Old string fragment to be diffed.
 * @param text2 New string fragment to be diffed.
 * @return LinkedList of Diff objects.
 */
protected LinkedList<Diff> diff_path1(List<Set<Long>> v_map,
        String text1, String text2) {
    LinkedList<Diff> path = new LinkedList<Diff>();
    int x = text1.length();
    int y = text2.length();
    Operation previous = null;
    for (int d = v_map.size() - 2; d >= 0; d--) {
        final Set<Long> visited = v_map.get(d);
        // One edit (delete or insert) is consumed per step; any number of
        // diagonal moves may come before it.
        boolean edited = false;
        while (!edited) {
            if (visited.contains(diff_footprint(x - 1, y))) {
                x--;
                if (previous == Operation.DELETE) {
                    Diff head = path.getFirst();
                    head.text = text1.charAt(x) + head.text;
                } else {
                    path.addFirst(new Diff(Operation.DELETE,
                            text1.substring(x, x + 1)));
                }
                previous = Operation.DELETE;
                edited = true;
            } else if (visited.contains(diff_footprint(x, y - 1))) {
                y--;
                if (previous == Operation.INSERT) {
                    Diff head = path.getFirst();
                    head.text = text2.charAt(y) + head.text;
                } else {
                    path.addFirst(new Diff(Operation.INSERT,
                            text2.substring(y, y + 1)));
                }
                previous = Operation.INSERT;
                edited = true;
            } else {
                // Neither neighbour was visited at this step: move diagonally.
                x--;
                y--;
                assert (text1.charAt(x) == text2.charAt(y))
                        : "No diagonal. Can't happen. (diff_path1)";
                if (previous == Operation.EQUAL) {
                    Diff head = path.getFirst();
                    head.text = text1.charAt(x) + head.text;
                } else {
                    path.addFirst(new Diff(Operation.EQUAL, text1.substring(x, x + 1)));
                }
                previous = Operation.EQUAL;
            }
        }
    }
    return path;
}
/**
* Work from the middle back to the end to determine the path.
* @param v_map List of path sets.
* @param text1 Old string fragment to be diffed.
* @param text2 New string fragment to be diffed.
* @return LinkedList of Diff objects.
*/
protected LinkedList<Diff> diff_path2(List<Set<Long>> v_map,
String text1, String text2) {
LinkedList<Diff> path = new LinkedList<Diff>();
int x = text1.length();
int y = text2.length();
Operation last_op = null;
for (int d = v_map.size() - 2; d >= 0; d--) {
while (true) {
if (v_map.get(d).contains(diff_footprint(x - 1, y))) {
x--;
if (last_op == Operation.DELETE) {
path.getLast().text += text1.charAt(text1.length() - x - 1);
} else {
path.addLast(new Diff(Operation.DELETE,
text1.substring(text1.length() - x - 1, text1.length() - x)));
}
last_op = Operation.DELETE;
break;
} else if (v_map.get(d).contains(diff_footprint(x, y - 1))) {
y--;
if (last_op == Operation.INSERT) {
path.getLast().text += text2.charAt(text2.length() - y - 1);
} else {
path.addLast(new Diff(Operation.INSERT,
text2.substring(text2.length() - y - 1, text2.length() - y)));
}
last_op = Operation.INSERT;
break;
} else {
x--;
y--;
assert (text1.