Java对比PDF文件内容

使用 java-diff-utils、pdfbox 完成PDF内容的比较以及差异处的高亮显示

文章仅对PDF文件中的文本内容进行对比,并在差异处高亮显示差异内容。新增文本为绿色、删除为红色。

调用 DiffUtil 的 getDiffPdf 方法,传入需要对比的两个 PDF 文件路径以及两个输出文件路径即可。

(此工具类仅进行文本内容的对比,无法进行图片的比较,请注意甄别。)

Spring Boot 版本 2.7.1

jdk 1.8

<dependencies>
        <!-- No version is given here; presumably managed by the Spring Boot parent/BOM (confirm in the full pom). -->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <!-- Diff library: DiffUtil uses com.github.difflib.DiffUtils, UnifiedDiffUtils and Patch. -->
        <dependency>
            <groupId>io.github.java-diff-utils</groupId>
            <artifactId>java-diff-utils</artifactId>
            <version>4.11</version>
        </dependency>

        <!-- PDF library: PDDocument, PDPage, PDFTextStripper and the highlight annotation classes used below. -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.29</version>
        </dependency>

        <!-- NOTE(review): nothing in the classes shown in this article imports from pdfbox-tools; confirm it is needed. -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox-tools</artifactId>
            <version>2.0.29</version>
        </dependency>

        <!-- NOTE(review): not imported directly by the classes shown; confirm whether an explicit declaration is needed. -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>fontbox</artifactId>
            <version>2.0.29</version>
        </dependency>

        <!-- NOTE(review): not imported by the classes shown in this article; confirm it is needed. -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>xmpbox</artifactId>
            <version>2.0.29</version>
        </dependency>

        <!-- NOTE(review): not imported by the classes shown in this article; confirm it is needed. -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>preflight</artifactId>
            <version>2.0.29</version>
        </dependency>

        <!-- Hutool: DiffUtil uses cn.hutool.core.collection.CollUtil. -->
        <dependency>
            <groupId>cn.hutool</groupId>
            <artifactId>hutool-all</artifactId>
            <version>5.6.0</version>
        </dependency>
        
    </dependencies>

引入以下三个类即可

工具类DiffUtil

package com.pdf.pdfcomparer.util;

import cn.hutool.core.collection.CollUtil;
import com.github.difflib.UnifiedDiffUtils;
import com.github.difflib.patch.Patch;
import com.pdf.pdfcomparer.overide.GetCharLocationAndSize;
import com.pdf.pdfcomparer.overide.LineTextPosition;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;

import java.io.File;
import java.io.IOException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;


public class DiffUtil {

    /**
     * 获取文本对比后的pdf
     * @param oldPdfPath 原pdf文件路径
     * @param newPdfPath 新pdf文件路径
     * @param outputOldPdfPath 原diff文件输出路径
     * @param outputNewPdfPath 新diff文件输出路径
     */
    public static int getDiffPdf(String oldPdfPath, String newPdfPath, String outputOldPdfPath, String outputNewPdfPath) {
        try {
            File file1 = new File(oldPdfPath);
            File file2 = new File(newPdfPath);
            String originalFileName = file1.getName();
            String revisedFileName = file2.getName();

            // 构建文本页信息
            List<GetCharLocationAndSize> charLAS1 = buildCharLocationAndSize(file1);
            List<GetCharLocationAndSize> charLAS2 = buildCharLocationAndSize(file2);
            // 获取所有文本详细信息
            List<LineTextPosition> lineTextPositions1 = new ArrayList<>();
            charLAS1.forEach(getCharLocationAndSize -> lineTextPositions1.addAll(getCharLocationAndSize.getLineTextPositions()));
            List<LineTextPosition> lineTextPositions2 = new ArrayList<>();
            charLAS2.forEach(getCharLocationAndSize -> lineTextPositions2.addAll(getCharLocationAndSize.getLineTextPositions()));
            List<String> original = CollUtil.emptyIfNull(lineTextPositions1).stream().map(LineTextPosition::getLineText).collect(Collectors.toList());
            List<String> revised = CollUtil.emptyIfNull(lineTextPositions2).stream().map(LineTextPosition::getLineText).collect(Collectors.toList());
            // 获取差异文本上下文信息
            List<String> unifiedDiff = getUnifiedDiff(originalFileName, original, revisedFileName, revised);
            // 设置文本差异标志
            List<String> signs = CollUtil.emptyIfNull(unifiedDiff).stream().filter(sign -> sign.startsWith("@@")).collect(Collectors.toList());
            if (signs.size() > 0) {
                signs.forEach(sign -> {
                    if (!sign.equals("@@ -0,0 +0,0 @@")) {

                        List<String> split = Arrays.asList(sign.split(" "));
                        // 源文件标识
                        String var1 = split.get(1);
                        String[] var2 = var1.split(",");
                        int start = Integer.parseInt(var2[0].substring(1)) - 1;
                        int end = start + Integer.parseInt(var2[1]);
                        for (int i = start; i < end; i++) {
                            lineTextPositions1.get(i).setType("1");
                        }

                        // 新文件标识
                        String var3 = split.get(2);
                        String[] var4 = var3.split(",");
                        int start1 = Integer.parseInt(var4[0].substring(1)) - 1;
                        int end1 = start1 + Integer.parseInt(var4[1]);
                        for (int i = start1; i < end1; i++) {
                            lineTextPositions2.get(i).setType("2");
                        }
                    }
                });
            }

            // 输出差异文件
            PDDocument pdDocument1 = PDDocument.load(file1);
            // 突出显示删除
            for (LineTextPosition lineTextPosition : lineTextPositions1) {
                if (lineTextPosition.getType().equals("1")) {
                    PDPage page = pdDocument1.getPage(lineTextPosition.getPageNum() - 1);
                    PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
                    markup.setColor(new PDColor(new float[] { 1, 0, 0}, PDDeviceRGB.INSTANCE));
                    float x = lineTextPosition.getX();
                    float y = lineTextPosition.getY();
                    float width = lineTextPosition.getWidth();
                    float height = lineTextPosition.getHeight();
                    PDRectangle bounds = new PDRectangle(x, y - (height / 2), width, height * 2);
                    markup.setRectangle(bounds);
                    // 从左下角计算 依次为左下、右下、左上、右上
                    float []p=pDRectangle2QuadPoints(bounds);
                    markup.setQuadPoints(p);
                    page.getAnnotations().add(markup);
                }
            }
            pdDocument1.save(outputOldPdfPath);
            pdDocument1.close();

            PDDocument pdDocument2 = PDDocument.load(file2);
            // 突出显示删除
            for (LineTextPosition lineTextPosition : lineTextPositions2) {
                if (lineTextPosition.getType().equals("2")) {
                    PDPage page = pdDocument2.getPage(lineTextPosition.getPageNum() - 1);
                    PDAnnotationTextMarkup markup = new PDAnnotationTextMarkup(PDAnnotationTextMarkup.SUB_TYPE_HIGHLIGHT);
                    markup.setColor(new PDColor(new float[] { 0, 255, 0}, PDDeviceRGB.INSTANCE));
                    float x = lineTextPosition.getX();
                    float y = lineTextPosition.getY();
                    float width = lineTextPosition.getWidth();
                    float height = lineTextPosition.getHeight();
                    PDRectangle bounds = new PDRectangle(x, y - (height / 2), width, height * 2);
                    markup.setRectangle(bounds);
                    // 从左下角计算 依次为左下、右下、左上、右上
                    float []p=pDRectangle2QuadPoints(bounds);
                    markup.setQuadPoints(p);
                    page.getAnnotations().add(markup);
                }
            }
            pdDocument2.save(outputNewPdfPath);
            pdDocument2.close();
        } catch (IOException e) {
            return 1;
        }
        return 0;
    }

    /**
     * 获取上下文差异描述信息
     * @param originalFileName 源文件名称
     * @param original 源文件文本信息
     * @param revisedFileName 新文件名称
     * @param revised 新文件文本信息
     * @return
     */
    public static List<String> getUnifiedDiff(String originalFileName, List<String> original, String revisedFileName, List<String> revised) {
        Patch<String> patch = com.github.difflib.DiffUtils.diff(original, revised);
        List<String> unifiedDiff = UnifiedDiffUtils.generateUnifiedDiff(originalFileName, revisedFileName, original, patch, 0);
        int diffCount = unifiedDiff.size();
        if (diffCount == 0) {
            //如果两文件没差异则插入如下
            unifiedDiff.add("--- " + originalFileName);
            unifiedDiff.add("+++ " + revisedFileName);
            unifiedDiff.add("@@ -0,0 +0,0 @@");
        } else if (diffCount >= 3 && !unifiedDiff.get(2).contains("@@ -1,")) {
            // 如果至少有一处变化,并且变化不在第一行
            unifiedDiff.set(1, unifiedDiff.get(1));
            //如果第一行没变化则插入@@ -0,0 +0,0 @@
            unifiedDiff.add(2, "@@ -0,0 +0,0 @@");
        }
        return unifiedDiff;
    }

    // 矩阵置换坐标
    private static float[] pDRectangle2QuadPoints(PDRectangle bounds) {
        float []p=new float[8];
        p[0]=bounds.getLowerLeftX();
        p[1]=bounds.getLowerLeftY();
        p[2]=bounds.getUpperRightX();
        p[3]=bounds.getLowerLeftY();
        p[4]=bounds.getLowerLeftX();
        p[5]=bounds.getUpperRightY();
        p[6]=bounds.getUpperRightX();
        p[7]=bounds.getUpperRightY();
        return p;
    }

    /**
     * 构建PDFTextStripper
     * @param file
     * @return
     */
    private static List<GetCharLocationAndSize> buildCharLocationAndSize(File file) {
        List<GetCharLocationAndSize> pdfTextStripperList;
        PDDocument pdDocument = null;
        try {
            pdDocument = PDDocument.load(file);
            int pageCount = pdDocument.getNumberOfPages();
            pdfTextStripperList = new ArrayList<>(pageCount);
            for (int i = 1; i <= pageCount; i++) {
                GetCharLocationAndSize pdfTextStripper = new GetCharLocationAndSize();
                pdfTextStripper.setPageNum(i);
                pdfTextStripper.setSortByPosition(true);
                pdfTextStripper.setStartPage(i);
                pdfTextStripper.setEndPage(i);
                StringWriter writer = new StringWriter();
                pdfTextStripper.writeText(pdDocument, writer);
                pdfTextStripperList.add(pdfTextStripper);
            }
        } catch (Exception e) {
            return null;
        } finally {
            if (pdDocument != null) {
                try {
                    pdDocument.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return pdfTextStripperList;
    }
}

工具类中用到的其他自定义的类

GetCharLocationAndSize.java、LineTextPosition.java

package com.pdf.pdfcomparer.overide;

import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Text stripper that records the text and a bounding box of every string it writes.
 * One instance corresponds to one PDF page.
 */
public class GetCharLocationAndSize extends PDFTextStripper {

    /** Entries recorded so far, in the order the strings were written. */
    private final List<LineTextPosition> lineTextPositions = new ArrayList<>();

    /** Page number stamped on every recorded entry. */
    private int pageNum = 0;

    public GetCharLocationAndSize() throws IOException {
    }

    /**
     * Records the position and size of {@code text}, then writes the text.
     *
     * @param text the string being written
     * @param textPositions the positions of the characters that make up {@code text}
     * @throws IOException if writing the text fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        // The first character anchors the box.
        TextPosition first = textPositions.get(0);
        float left = first.getXDirAdj();
        // Recorded y is the page height minus the adjusted y of the first character.
        float bottom = first.getPageHeight() - first.getYDirAdj();

        // Width is the sum of the character widths; height is the tallest character.
        float totalWidth = 0.0f;
        float maxHeight = 0.0f;
        for (int i = 0; i < textPositions.size(); i++) {
            TextPosition glyph = textPositions.get(i);
            totalWidth += glyph.getWidthDirAdj();
            float glyphHeight = glyph.getHeightDir();
            if (glyphHeight > maxHeight) {
                maxHeight = glyphHeight;
            }
        }

        LineTextPosition entry = new LineTextPosition();
        entry.setLineText(text);
        entry.setX(left);
        entry.setY(bottom);
        entry.setWidth(totalWidth);
        entry.setHeight(maxHeight);
        entry.setPageNum(pageNum);
        lineTextPositions.add(entry);

        writeString(text);
    }

    /** Returns the entries recorded by this stripper. */
    public List<LineTextPosition> getLineTextPositions() {
        return lineTextPositions;
    }

    public int getPageNum() {
        return pageNum;
    }

    public void setPageNum(int pageNum) {
        this.pageNum = pageNum;
    }
}
package com.pdf.pdfcomparer.overide;


public class LineTextPosition {

    /** Text of the entry. */
    private String lineText;

    /** X coordinate of the lower-left corner of the text box. */
    private float x;

    /** Y coordinate of the lower-left corner of the text box. */
    private float y;

    /** Width of the text box. */
    private float width;

    /** Height of the text box. */
    private float height;

    /** Number of the page the text is on. */
    private int pageNum;

    /** Change type: "0" = unchanged, "1" = deleted, "2" = added. */
    private String type = "0";

    // ---- text ----

    public String getLineText() {
        return this.lineText;
    }

    public void setLineText(String lineText) {
        this.lineText = lineText;
    }

    // ---- geometry ----

    public float getX() {
        return this.x;
    }

    public void setX(float x) {
        this.x = x;
    }

    public float getY() {
        return this.y;
    }

    public void setY(float y) {
        this.y = y;
    }

    public float getWidth() {
        return this.width;
    }

    public void setWidth(float width) {
        this.width = width;
    }

    public float getHeight() {
        return this.height;
    }

    public void setHeight(float height) {
        this.height = height;
    }

    // ---- page and change type ----

    public int getPageNum() {
        return this.pageNum;
    }

    public void setPageNum(int pageNum) {
        this.pageNum = pageNum;
    }

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }
}

 

评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

long_ky

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值