要比较 Word 文档的内容,首先需要读取文档中的文本,这里使用 Apache POI 库;文本比较部分可以使用 Apache 的 commons-text 库。
引入依赖(下面的示例代码还用到了 Lombok 和 JUnit 5,如果项目中没有,需要另外引入)
<!-- Apache POI core module, same version as poi-ooxml below. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.1</version>
</dependency>
<!-- Apache POI OOXML module: supplies the org.apache.poi.xwpf classes (XWPFDocument, XWPFWordExtractor) imported by the examples. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.1</version>
</dependency>
<!-- Apache Commons Text: supplies the org.apache.commons.text.diff classes (StringsComparator, EditScript, CommandVisitor) imported by the examples. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-text</artifactId>
<version>1.11.0</version>
</dependency>
这边要注意下你使用的commons-text的版本,它的api有很大的调整,我使用的版本为1.11.0
实现输出新增和删除内容
你可以使用 StringsComparator 类来实现文本内容的比较。它使用了访问者模式:StringsComparator 负责逐个字符地告诉你哪些内容被保留、哪些被新增、哪些被删除,而由你提供访问者(CommandVisitor)来实现想要的效果,比如下面这个例子就是输出新增和删除的内容。
效果:
import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo test that prints the text inserted into and deleted from one Word document
 * relative to another, using Apache POI to read the text and commons-text to diff it.
 */
public class DocTest {

    /**
     * Reads two .docx files, diffs their text character by character and prints every
     * inserted or deleted run of characters.
     *
     * @throws IllegalStateException if either document cannot be read, so that the test
     *     fails instead of silently passing
     */
    @Test
    public void testCompare() {
        String content1 = readDocxText("D:\\doc\\1.docx");
        String content2 = readDocxText("D:\\doc\\2.docx");

        // The commons-text API changed a lot between releases; this code targets 1.11.0.
        StringsComparator comparator = new StringsComparator(content1, content2);
        EditScript<Character> script = comparator.getScript();
        ChangedCommandVisitor commandVisitor = new ChangedCommandVisitor();
        script.visit(commandVisitor);
        // Flush a change run that extends to the very end of the text.
        commandVisitor.finish();

        List<ChangedWords> changedWordsList = commandVisitor.getChangedWordsList();
        System.out.println("******变更内容******");
        for (int i = 0; i < changedWordsList.size(); i++) {
            ChangedWords changedWords = changedWordsList.get(i);
            String operator = changedWords.getType() == ChangedWords.TYPE_INSERT ? "新增" : "删除";
            System.out.println("#" + (i + 1) + operator + ": " + changedWords.getWords());
        }
    }

    /**
     * Extracts the plain text of a .docx file.
     *
     * <p>The stream and the document are opened in a try-with-resources statement so that
     * both are closed even when opening or extracting fails.
     *
     * @param path location of the .docx file
     * @return the text returned by {@link XWPFWordExtractor#getText()}
     * @throws IllegalStateException if the file cannot be opened or parsed; the original
     *     exception is kept as the cause
     */
    private static String readDocxText(String path) {
        try (FileInputStream in = new FileInputStream(path);
             XWPFDocument doc = new XWPFDocument(in)) {
            // The extractor only wraps the document, which is closed by this try block.
            return new XWPFWordExtractor(doc).getText();
        } catch (Exception e) {
            throw new IllegalStateException("Failed to read Word document: " + path, e);
        }
    }

    /** One contiguous run of changed characters together with the kind of change. */
    @Data
    @AllArgsConstructor
    static class ChangedWords {
        /** Value of {@code type} for text that was inserted. */
        static final int TYPE_INSERT = 0;
        /** Value of {@code type} for text that was deleted. */
        static final int TYPE_DELETE = 1;

        private String words;
        private int type; // TYPE_INSERT (0) or TYPE_DELETE (1)
    }

    /**
     * Visitor that groups consecutive insert commands and consecutive delete commands
     * into {@link ChangedWords} entries. Kept characters are not recorded; they only
     * terminate the run that is currently being collected.
     *
     * <p>Call {@link #finish()} after visiting so that a run ending at the last
     * character is recorded as well.
     */
    static class ChangedCommandVisitor implements CommandVisitor<Character> {
        private static final int TAG_KEEP = 0;
        private static final int TAG_INSERT = 1;
        private static final int TAG_DELETE = 2;

        private final List<ChangedWords> changedWordsList = new ArrayList<>();
        /** Characters of the run currently being collected. */
        private final StringBuilder temp = new StringBuilder();
        /** Kind of the previous command: TAG_KEEP, TAG_INSERT or TAG_DELETE. */
        private int lastTag = TAG_KEEP;

        @Override
        public void visitDeleteCommand(Character object) {
            // A delete directly after inserts ends the pending insert run.
            if (lastTag == TAG_INSERT) {
                flush();
            }
            lastTag = TAG_DELETE;
            temp.append(object);
        }

        @Override
        public void visitInsertCommand(Character object) {
            // An insert directly after deletes ends the pending delete run.
            if (lastTag == TAG_DELETE) {
                flush();
            }
            lastTag = TAG_INSERT;
            temp.append(object);
        }

        @Override
        public void visitKeepCommand(Character object) {
            finish();
        }

        /** Records the pending run, if any, and resets the visitor to the "keep" state. */
        public void finish() {
            flush();
            lastTag = TAG_KEEP;
        }

        /**
         * Returns the recorded changes in the order they were encountered.
         *
         * @return the internal list of changes (not a copy)
         */
        public List<ChangedWords> getChangedWordsList() {
            return changedWordsList;
        }

        /** Stores the buffered characters as an insert or delete entry and clears the buffer. */
        private void flush() {
            if (lastTag == TAG_INSERT) {
                changedWordsList.add(new ChangedWords(temp.toString(), ChangedWords.TYPE_INSERT));
            } else if (lastTag == TAG_DELETE) {
                changedWordsList.add(new ChangedWords(temp.toString(), ChangedWords.TYPE_DELETE));
            }
            temp.setLength(0);
        }
    }
}
实现在源文本上标记修改
输出的内容是html,可直接在网页里面显示,自己加点样式就可以实现不同的显示效果
效果:
package com.wkt.server;
import lombok.AllArgsConstructor;
import lombok.Data;
import org.apache.commons.text.diff.CommandVisitor;
import org.apache.commons.text.diff.EditScript;
import org.apache.commons.text.diff.StringsComparator;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.jupiter.api.Test;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Demo test that prints the combined text of two Word documents as HTML, with inserted
 * text wrapped in {@code <em>} and deleted text wrapped in {@code <del>}.
 */
public class DocTest {

    /**
     * Reads two .docx files, diffs their text character by character and prints the
     * marked-up HTML.
     *
     * @throws IllegalStateException if either document cannot be read, so that the test
     *     fails instead of silently passing
     */
    @Test
    public void testCompare() {
        String content1 = readDocxText("D:\\doc\\1.docx");
        String content2 = readDocxText("D:\\doc\\2.docx");

        // The commons-text API changed a lot between releases; this code targets 1.11.0.
        StringsComparator comparator = new StringsComparator(content1, content2);
        EditScript<Character> script = comparator.getScript();
        TextChangedCommandVisitor commandVisitor = new TextChangedCommandVisitor();
        script.visit(commandVisitor);
        // Close a tag that is still open when the text ends with a change.
        commandVisitor.finish();
        System.out.println(commandVisitor.getContent());
    }

    /**
     * Extracts the plain text of a .docx file.
     *
     * <p>The stream and the document are opened in a try-with-resources statement so that
     * both are closed even when opening or extracting fails.
     *
     * @param path location of the .docx file
     * @return the text returned by {@link XWPFWordExtractor#getText()}
     * @throws IllegalStateException if the file cannot be opened or parsed; the original
     *     exception is kept as the cause
     */
    private static String readDocxText(String path) {
        try (FileInputStream in = new FileInputStream(path);
             XWPFDocument doc = new XWPFDocument(in)) {
            // The extractor only wraps the document, which is closed by this try block.
            return new XWPFWordExtractor(doc).getText();
        } catch (Exception e) {
            throw new IllegalStateException("Failed to read Word document: " + path, e);
        }
    }

    /**
     * Visitor that rebuilds the compared text as HTML: kept characters are copied,
     * inserted runs are wrapped in {@code <em>...</em>} and deleted runs in
     * {@code <del>...</del>}.
     *
     * <p>Every character taken from the documents is HTML-escaped, because the result is
     * meant to be embedded in a web page and the document text may itself contain
     * {@code <}, {@code >}, {@code &} or quotes.
     *
     * <p>Call {@link #finish()} after visiting so that a tag opened by a trailing change
     * is closed.
     */
    static class TextChangedCommandVisitor implements CommandVisitor<Character> {
        private static final int TAG_KEEP = 0;
        private static final int TAG_INSERT = 1;
        private static final int TAG_DELETE = 2;

        private static final String INSERT_START = "<em>";
        private static final String INSERT_END = "</em>";
        private static final String DELETE_START = "<del>";
        private static final String DELETE_END = "</del>";

        private final StringBuilder content = new StringBuilder();
        /** Kind of the previous command: TAG_KEEP, TAG_INSERT or TAG_DELETE. */
        private int lastTag = TAG_KEEP;

        @Override
        public void visitDeleteCommand(Character object) {
            if (lastTag == TAG_INSERT) {
                // Switch directly from an insert run to a delete run.
                content.append(INSERT_END).append(DELETE_START);
            } else if (lastTag == TAG_KEEP) {
                content.append(DELETE_START);
            }
            appendEscaped(object);
            lastTag = TAG_DELETE;
        }

        @Override
        public void visitInsertCommand(Character object) {
            if (lastTag == TAG_DELETE) {
                // Switch directly from a delete run to an insert run.
                content.append(DELETE_END).append(INSERT_START);
            } else if (lastTag == TAG_KEEP) {
                content.append(INSERT_START);
            }
            appendEscaped(object);
            lastTag = TAG_INSERT;
        }

        @Override
        public void visitKeepCommand(Character object) {
            finish();
            appendEscaped(object);
        }

        /** Closes the currently open tag, if any, and resets the visitor to the "keep" state. */
        public void finish() {
            if (lastTag == TAG_INSERT) {
                content.append(INSERT_END);
            } else if (lastTag == TAG_DELETE) {
                content.append(DELETE_END);
            }
            lastTag = TAG_KEEP;
        }

        /**
         * Returns the HTML built so far.
         *
         * @return the internal buffer (not a copy)
         */
        public StringBuilder getContent() {
            return content;
        }

        /** Appends one document character, replacing HTML-significant characters by entities. */
        private void appendEscaped(Character object) {
            char c = object;
            switch (c) {
                case '&':
                    content.append("&amp;");
                    break;
                case '<':
                    content.append("&lt;");
                    break;
                case '>':
                    content.append("&gt;");
                    break;
                case '"':
                    content.append("&quot;");
                    break;
                case '\'':
                    content.append("&#39;");
                    break;
                default:
                    content.append(c);
                    break;
            }
        }
    }
}