使用 java-diff-utils、pdfbox 完成PDF内容的比较以及差异处的高亮显示
文章仅对PDF文件中的文本内容进行对比,并在差异处高亮显示差异内容。新增文本为绿色、删除为红色。
在 DiffUtil 的 getDiffPdf 方法传入需要对比的文件路径,以及输出的文件路径。
(此工具类仅进行文本内容的对比,无法进行图片的比较,请注意甄别。)
Spring Boot 版本 2.7.1
jdk 1.8
<!-- Maven dependencies for the PDF text-comparison utility described in this article. -->
<dependencies>
<!-- Spring Boot web starter; no version is given here, so it is presumably managed by the Spring Boot parent/BOM (confirm in the enclosing pom). -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Diff library: DiffUtil uses its DiffUtils, UnifiedDiffUtils and Patch classes. -->
<dependency>
<groupId>io.github.java-diff-utils</groupId>
<artifactId>java-diff-utils</artifactId>
<version>4.11</version>
</dependency>
<!-- PDFBox core: document loading, text stripping and highlight annotations. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>
<!-- NOTE(review): pdfbox-tools is not referenced by the classes shown in this article; confirm whether it is needed. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.29</version>
</dependency>
<!-- NOTE(review): fontbox is not referenced directly by the classes shown in this article. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.29</version>
</dependency>
<!-- NOTE(review): xmpbox is not referenced by the classes shown in this article; confirm whether it is needed. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>xmpbox</artifactId>
<version>2.0.29</version>
</dependency>
<!-- NOTE(review): preflight is not referenced by the classes shown in this article; confirm whether it is needed. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>preflight</artifactId>
<version>2.0.29</version>
</dependency>
<!-- Hutool: DiffUtil imports CollUtil from this artifact. -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.6.0</version>
</dependency>
</dependencies>
引入以下三个类即可
工具类DiffUtil
package com.pdf.pdfcomparer.util;
import cn.hutool.core.collection.CollUtil;
import com.github.difflib.UnifiedDiffUtils;
import com.github.difflib.patch.Patch;
import com.pdf.pdfcomparer.overide.GetCharLocationAndSize;
import com.pdf.pdfcomparer.overide.LineTextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
/**
 * Compares the text content of two PDF files and writes a highlighted copy of each.
 *
 * <p>Text fragments that only exist in the old file are highlighted in red in the old
 * file's copy; fragments that only exist in the new file are highlighted in green in the
 * new file's copy. Only text is compared; images and other content are ignored.
 */
public class DiffUtil {

    /** {@link LineTextPosition} type marker for a fragment removed from the old file. */
    private static final String TYPE_DELETED = "1";

    /** {@link LineTextPosition} type marker for a fragment added in the new file. */
    private static final String TYPE_ADDED = "2";

    /** Placeholder hunk header that {@link #getUnifiedDiff} inserts; it marks no lines. */
    private static final String EMPTY_HUNK = "@@ -0,0 +0,0 @@";

    /**
     * Diffs the text of two PDFs and writes an annotated copy of each.
     *
     * @param oldPdfPath       path of the original PDF
     * @param newPdfPath       path of the revised PDF
     * @param outputOldPdfPath where to write the original PDF with deletions highlighted
     * @param outputNewPdfPath where to write the revised PDF with additions highlighted
     * @return {@code 0} on success, {@code 1} if reading, parsing or writing a PDF failed
     */
    public static int getDiffPdf(String oldPdfPath, String newPdfPath, String outputOldPdfPath, String outputNewPdfPath) {
        try {
            File oldFile = new File(oldPdfPath);
            File newFile = new File(newPdfPath);

            // Extract every text fragment (with its page and geometry) from both files.
            List<LineTextPosition> oldFragments = extractFragments(oldFile);
            List<LineTextPosition> newFragments = extractFragments(newFile);
            List<String> original = oldFragments.stream().map(LineTextPosition::getLineText).collect(Collectors.toList());
            List<String> revised = newFragments.stream().map(LineTextPosition::getLineText).collect(Collectors.toList());

            // Flag the fragments covered by each hunk of the unified diff.
            List<String> unifiedDiff = getUnifiedDiff(oldFile.getName(), original, newFile.getName(), revised);
            for (String diffLine : unifiedDiff) {
                if (diffLine.startsWith("@@") && !EMPTY_HUNK.equals(diffLine)) {
                    markHunk(diffLine, oldFragments, newFragments);
                }
            }

            // Colour components are in the range 0..1: red for deletions, green for additions.
            writeHighlightedCopy(oldFile, oldFragments, TYPE_DELETED, new float[] {1f, 0f, 0f}, outputOldPdfPath);
            writeHighlightedCopy(newFile, newFragments, TYPE_ADDED, new float[] {0f, 1f, 0f}, outputNewPdfPath);
        } catch (IOException e) {
            // Keep the integer status contract for callers, but do not hide the cause.
            e.printStackTrace();
            return 1;
        }
        return 0;
    }

    /**
     * Builds a unified diff (context size 0) between two lists of text fragments.
     *
     * <p>If the inputs are identical, a header and the placeholder hunk
     * {@code @@ -0,0 +0,0 @@} are returned. If there are differences and the third line of
     * the diff does not contain {@code "@@ -1,"}, the placeholder hunk is inserted before
     * the first real hunk.
     *
     * @param originalFileName name used for the original file in the diff header
     * @param original         text fragments of the original file
     * @param revisedFileName  name used for the revised file in the diff header
     * @param revised          text fragments of the revised file
     * @return the unified diff lines
     */
    public static List<String> getUnifiedDiff(String originalFileName, List<String> original, String revisedFileName, List<String> revised) {
        Patch<String> patch = com.github.difflib.DiffUtils.diff(original, revised);
        List<String> unifiedDiff = UnifiedDiffUtils.generateUnifiedDiff(originalFileName, revisedFileName, original, patch, 0);
        int diffCount = unifiedDiff.size();
        if (diffCount == 0) {
            // No differences: emit a header plus the placeholder hunk.
            unifiedDiff.add("--- " + originalFileName);
            unifiedDiff.add("+++ " + revisedFileName);
            unifiedDiff.add(EMPTY_HUNK);
        } else if (diffCount >= 3 && !unifiedDiff.get(2).contains("@@ -1,")) {
            // First hunk does not start at line 1: insert the placeholder hunk before it.
            unifiedDiff.add(2, EMPTY_HUNK);
        }
        return unifiedDiff;
    }

    /** Flags the old/new fragments covered by one {@code @@ -a,b +c,d @@} hunk header. */
    private static void markHunk(String hunkHeader, List<LineTextPosition> oldFragments, List<LineTextPosition> newFragments) {
        String[] parts = hunkHeader.split(" ");
        markRange(parts[1], oldFragments, TYPE_DELETED);
        markRange(parts[2], newFragments, TYPE_ADDED);
    }

    /** Applies {@code type} to the fragments named by one hunk range such as {@code -12,3}. */
    private static void markRange(String range, List<LineTextPosition> fragments, String type) {
        // Drop the leading '-' or '+', then split "<start>,<count>".
        String[] startAndCount = range.substring(1).split(",");
        int start = Integer.parseInt(startAndCount[0]) - 1;
        // Unified diff allows the count to be omitted, in which case it is 1.
        int count = startAndCount.length > 1 ? Integer.parseInt(startAndCount[1]) : 1;
        for (int i = start; i < start + count; i++) {
            fragments.get(i).setType(type);
        }
    }

    /** Loads {@code source}, highlights every fragment of the given type, and saves to {@code outputPath}. */
    private static void writeHighlightedCopy(File source, List<LineTextPosition> fragments, String type, float[] rgb, String outputPath) throws IOException {
        // try-with-resources closes the document even if annotating or saving fails.
        try (PDDocument document = PDDocument.load(source)) {
            for (LineTextPosition fragment : fragments) {
                if (!type.equals(fragment.getType())) {
                    continue;
                }
                // Page numbers on fragments are 1-based; PDDocument pages are 0-based.
                PDPage page = document.getPage(fragment.getPageNum() - 1);
                PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
                markup.setColor(new PDColor(rgb, PDDeviceRGB.INSTANCE));
                float height = fragment.getHeight();
                // The box is twice the glyph height and starts half a height below y.
                PDRectangle bounds = new PDRectangle(fragment.getX(), fragment.getY() - (height / 2), fragment.getWidth(), height * 2);
                markup.setRectangle(bounds);
                markup.setQuadPoints(pDRectangle2QuadPoints(bounds));
                page.getAnnotations().add(markup);
            }
            document.save(outputPath);
        }
    }

    /**
     * Converts a rectangle into the eight quad-point coordinates of a text-markup annotation.
     * Corner order: lower-left, lower-right, upper-left, upper-right.
     */
    private static float[] pDRectangle2QuadPoints(PDRectangle bounds) {
        float left = bounds.getLowerLeftX();
        float bottom = bounds.getLowerLeftY();
        float right = bounds.getUpperRightX();
        float top = bounds.getUpperRightY();
        return new float[] {left, bottom, right, bottom, left, top, right, top};
    }

    /** Collects the text fragments of every page of {@code file}, in page order. */
    private static List<LineTextPosition> extractFragments(File file) throws IOException {
        List<LineTextPosition> fragments = new ArrayList<>();
        for (GetCharLocationAndSize stripper : buildCharLocationAndSize(file)) {
            fragments.addAll(stripper.getLineTextPositions());
        }
        return fragments;
    }

    /**
     * Runs one {@link GetCharLocationAndSize} stripper per page of the given PDF.
     *
     * @param file the PDF to read
     * @return one populated stripper per page, in page order (never {@code null})
     * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
     */
    private static List<GetCharLocationAndSize> buildCharLocationAndSize(File file) throws IOException {
        try (PDDocument pdDocument = PDDocument.load(file)) {
            int pageCount = pdDocument.getNumberOfPages();
            List<GetCharLocationAndSize> strippers = new ArrayList<>(pageCount);
            for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
                GetCharLocationAndSize stripper = new GetCharLocationAndSize();
                stripper.setPageNum(pageNo);
                stripper.setSortByPosition(true);
                stripper.setStartPage(pageNo);
                stripper.setEndPage(pageNo);
                // The extracted text itself is discarded; the stripper records fragments as a side effect.
                stripper.writeText(pdDocument, new StringWriter());
                strippers.add(stripper);
            }
            return strippers;
        }
    }
}
工具类中用到的其他自定义的类
GetCharLocationAndSize.class 、 LineTextPosition.class
package com.pdf.pdfcomparer.overide;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * A {@link PDFTextStripper} that records the text, position and size of every string it
 * writes. One instance is used per PDF page; the caller supplies the page number through
 * {@link #setPageNum(int)}.
 */
public class GetCharLocationAndSize extends PDFTextStripper {
    /** Fragments recorded so far, in the order they were written. */
    private final List<LineTextPosition> lineTextPositions = new ArrayList<>();

    /** Page number stamped onto every recorded fragment; 0 until set by the caller. */
    private int pageNum = 0;

    public GetCharLocationAndSize() throws IOException {
    }

    /**
     * Records the geometry of the string being written, then writes it as usual.
     *
     * <p>NOTE(review): in PDFBox 2.x this callback appears to be invoked per word rather
     * than per full line; confirm against the PDFBox version in use.
     *
     * @param text          the string being written
     * @param textPositions the glyph positions that make up {@code text}
     * @throws IOException if writing the text fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        // Without glyph positions there is no geometry to record; still emit the text.
        if (textPositions != null && !textPositions.isEmpty()) {
            lineTextPositions.add(toLineTextPosition(text, textPositions));
        }
        writeString(text);
    }

    /** Builds the fragment record for one written string from its glyph positions. */
    private LineTextPosition toLineTextPosition(String text, List<TextPosition> textPositions) {
        TextPosition first = textPositions.get(0);

        // Width is the sum of the glyph widths; height is the tallest glyph.
        float totalWidth = 0.0f;
        float maxHeight = 0.0f;
        for (TextPosition glyph : textPositions) {
            totalWidth += glyph.getWidthDirAdj();
            maxHeight = Math.max(maxHeight, glyph.getHeightDir());
        }

        LineTextPosition fragment = new LineTextPosition();
        fragment.setLineText(text);
        fragment.setX(first.getXDirAdj());
        // y is the page height minus the first glyph's direction-adjusted y.
        fragment.setY(first.getPageHeight() - first.getYDirAdj());
        fragment.setWidth(totalWidth);
        fragment.setHeight(maxHeight);
        fragment.setPageNum(pageNum);
        return fragment;
    }

    /** Returns the live list of fragments recorded so far. */
    public List<LineTextPosition> getLineTextPositions() {
        return lineTextPositions;
    }

    public int getPageNum() {
        return pageNum;
    }

    public void setPageNum(int pageNum) {
        this.pageNum = pageNum;
    }
}
package com.pdf.pdfcomparer.overide;
/**
 * Mutable holder for one extracted text fragment: its text, its bounding geometry,
 * the page it is on, and its diff classification.
 */
public class LineTextPosition {

    // ---- text ----

    /** Text of the fragment. */
    private String lineText;

    public String getLineText() {
        return this.lineText;
    }

    public void setLineText(String lineText) {
        this.lineText = lineText;
    }

    // ---- geometry ----

    /** X coordinate of the lower-left corner of the text box. */
    private float x;

    /** Y coordinate of the lower-left corner of the text box. */
    private float y;

    /** Width of the text box. */
    private float width;

    /** Height of the text box. */
    private float height;

    public float getX() {
        return this.x;
    }

    public void setX(float x) {
        this.x = x;
    }

    public float getY() {
        return this.y;
    }

    public void setY(float y) {
        this.y = y;
    }

    public float getWidth() {
        return this.width;
    }

    public void setWidth(float width) {
        this.width = width;
    }

    public float getHeight() {
        return this.height;
    }

    public void setHeight(float height) {
        this.height = height;
    }

    // ---- location ----

    /** Number of the page the fragment is on. */
    private int pageNum;

    public int getPageNum() {
        return this.pageNum;
    }

    public void setPageNum(int pageNum) {
        this.pageNum = pageNum;
    }

    // ---- diff classification ----

    /** Diff type: "0" = unchanged, "1" = deleted, "2" = added. Defaults to "0". */
    private String type = "0";

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }
}