概览
基于 PDFBox，按行解析 PDF 得到文字信息，再根据预先定义的关键词进行匹配并定位其坐标，从而按文字划定区域，对该区域的内容进行替换或隐藏。
Maven
<!-- PDFBox related dependencies: begin -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>xmpbox</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>preflight</artifactId>
<version>2.0.29</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.29</version>
</dependency>
<!-- NOTE(review): jempbox is pinned to a 1.8.x version while the other PDFBox artifacts are 2.0.29; confirm it is still needed -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jempbox</artifactId>
<version>1.8.17</version>
</dependency>
<!-- PDFBox related dependencies: end -->
<!-- Lombok: supplies the @Data / @Getter annotations used by the classes below -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.30</version>
</dependency>
查找的关键字实体类
package com.xrl;
import lombok.Data;
import java.io.Serializable;
/**
 * Describes one occurrence of a searched keyword together with the rectangle it occupies.
 *
 * @version [v1.0]
 * @author: [xrl]
 * @create: [2024/03/05 17:59]
 * @Description: Entity class for a located keyword
 **/
@Data
public class KeyWordEntity implements Serializable {
    /**
     * Explicit serialization version so the serialized form does not depend on a
     * compiler-generated default.
     */
    private static final long serialVersionUID = 1L;

    /**
     * The keyword that was matched.
     */
    private String keyWord;

    /**
     * Sequence number assigned to the match by the code that located it.
     */
    private Integer serialNumber;

    /**
     * X coordinate on the PDF page.
     */
    private float x;

    /**
     * Y coordinate on the PDF page.
     */
    private float y;

    /**
     * Width (horizontal extent) of the keyword.
     */
    private float width;

    /**
     * Height of the keyword.
     */
    private float height;
}
自定义文本提取器
package com.xrl;
import lombok.Getter;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
/**
 * Custom text stripper that records where each searched keyword appears.
 *
 * <p>Matching is done inside a single {@link #writeString(String, List)} call, so a keyword
 * that is split across two calls is not detected.
 *
 * @version [v1.0]
 * @author: [xrl]
 * @create: [2024/03/05 17:59]
 * @Description: Custom text stripper that captures the coordinates of the searched text
 **/
public class KeyWordPositionStripper extends PDFTextStripper {
    /**
     * Keywords to search for.
     */
    private final List<String> keyWordList;

    /**
     * Entities for every keyword occurrence found so far.
     */
    @Getter
    private final List<KeyWordEntity> keyWordEntityList = new ArrayList<>();

    /**
     * Creates a stripper that looks for the given keywords.
     *
     * @param keyWordList keywords to search for; {@code null} is treated as "no keywords"
     * @throws IOException propagated from the {@link PDFTextStripper} constructor
     */
    public KeyWordPositionStripper(List<String> keyWordList) throws IOException {
        this.keyWordList = keyWordList == null ? new ArrayList<String>() : keyWordList;
    }

    /**
     * Scans the glyphs of the current text chunk for every keyword and stores one
     * {@link KeyWordEntity} per occurrence (overlapping occurrences included).
     *
     * <p>The serial number restarts at 0 on every call, i.e. it numbers the matches within
     * one chunk rather than within the whole document.
     *
     * @param text      the chunk as a string (unused; matching works on {@code positions})
     * @param positions glyph positions that make up the chunk
     */
    @Override
    protected void writeString(String text, List<TextPosition> positions) {
        int size = positions.size();
        int num = 0;
        for (String keyWord : keyWordList) {
            // A null or empty keyword can never match; skipping it also avoids indexing chars[0].
            if (keyWord == null || keyWord.isEmpty()) {
                continue;
            }
            char[] chars = keyWord.toCharArray();
            for (int i = 0; i + chars.length <= size; i++) {
                if (!matchesAt(positions, i, chars)) {
                    continue;
                }
                // Index of the glyph right after the keyword; it may be past the end of the chunk.
                int next = i + chars.length;
                boolean hasFollowingGlyph = next < size;
                TextPosition startPosition = positions.get(i);
                TextPosition endPosition = positions.get(hasFollowingGlyph ? next : next - 1);

                // Start of the first matched glyph.
                Matrix matrix = startPosition.getTextMatrix();
                float x = matrix.getTranslateX();
                float y = matrix.getTranslateY();
                // X of the glyph after the keyword, or of the last keyword glyph at the chunk end.
                float x2 = endPosition.getTextMatrix().getTranslateX();
                float fontSizeInPt = startPosition.getFontSizeInPt();

                KeyWordEntity entity = new KeyWordEntity();
                entity.setKeyWord(keyWord);
                entity.setX(x);
                // Shift down by a fifth of the font size (presumably to include descenders).
                entity.setY(y - fontSizeInPt / 5);
                // At the chunk end there is no following glyph to measure against, so the last
                // glyph's width is approximated by the font size.
                entity.setWidth(hasFollowingGlyph ? x2 - x : x2 - x + fontSizeInPt);
                entity.setHeight(fontSizeInPt);
                entity.setSerialNumber(num++);
                keyWordEntityList.add(entity);
            }
        }
    }

    /**
     * Tells whether the glyphs starting at {@code start} spell out {@code chars}.
     * The caller guarantees that {@code start + chars.length <= positions.size()}.
     */
    private static boolean matchesAt(List<TextPosition> positions, int start, char[] chars) {
        for (int offset = 0; offset < chars.length; offset++) {
            String glyph = positions.get(start + offset).getUnicode();
            if (!Objects.equals(glyph, String.valueOf(chars[offset]))) {
                return false;
            }
        }
        return true;
    }
}
基于PDFBox的模板类
package com.xrl;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.*;
import java.util.List;
/**
 * Template class that loads a PDF, locates keywords, lets a subclass redraw the affected
 * regions and saves the result.
 *
 * @version [v1.0]
 * @author: [xrl]
 * @create: [2024/03/05 17:59]
 * @Description: PDFBox based template class
 **/
public abstract class PDFTextParsingAbstract {
    private static final String HTTP_PREFIX = "http://";
    private static final String HTTPS_PREFIX = "https://";

    /**
     * Reads the PDF template and replaces the data of the given keywords.
     *
     * <p>The document is closed on every exit path, including failures in the intermediate steps.
     *
     * @param keyWordMap  keyword data to replace; the key is the placeholder, the value is the
     *                    replacement text. Nothing happens when it is {@code null} or empty.
     * @param pdfPath     path of the PDF template, either a local path or an http(s) URL
     * @param destPdfPath path of the generated PDF; an http(s) URL is delegated to
     *                    {@link #saveOssPdf(PDDocument, String)}
     * @param fontPath    path of the font file, either a local path or an http(s) URL
     * @throws RuntimeException when loading, text extraction, font loading, saving or closing
     *                          fails; the underlying {@link IOException} is kept as the cause
     */
    public void replaceText(Map<String, String> keyWordMap, String pdfPath, String destPdfPath, String fontPath) {
        if (keyWordMap == null || keyWordMap.isEmpty()) {
            return;
        }
        // The helpers wrap their own IOExceptions, so the only IOException that can reach the
        // catch below is the one thrown by the implicit document.close().
        try (PDDocument document = loadDocument(pdfPath)) {
            List<KeyWordEntity> keyWordEntityList = locateKeyWords(document, keyWordMap.keySet());
            PDType0Font font = loadFont(document, fontPath);
            replacementRule(keyWordMap, keyWordEntityList, document, font);
            saveDocument(document, destPdfPath);
        } catch (IOException e) {
            throw new RuntimeException("关闭文档异常 [异常信息]: " + e, e);
        }
    }

    /**
     * Tells whether the path is an http(s) URL rather than a local file path.
     */
    private static boolean isRemote(String path) {
        return path.startsWith(HTTP_PREFIX) || path.startsWith(HTTPS_PREFIX);
    }

    /**
     * Step 1: loads the PDF template from a URL or from the local file system.
     */
    private PDDocument loadDocument(String pdfPath) {
        try {
            if (isRemote(pdfPath)) {
                try (InputStream pdfIn = new URL(pdfPath).openConnection().getInputStream()) {
                    return PDDocument.load(pdfIn);
                }
            }
            return PDDocument.load(new File(pdfPath));
        } catch (IOException e) {
            throw new RuntimeException("读取PDF模板文件异常 [异常信息]: " + e, e);
        }
    }

    /**
     * Steps 2-3: runs the custom text stripper over the document and returns the located keywords.
     */
    private List<KeyWordEntity> locateKeyWords(PDDocument document, Set<String> keyWordSet) {
        try {
            KeyWordPositionStripper stripper = new KeyWordPositionStripper(new ArrayList<>(keyWordSet));
            stripper.setSortByPosition(true);
            stripper.getText(document);
            return stripper.getKeyWordEntityList();
        } catch (IOException e) {
            throw new RuntimeException("创建自定义文本提取器异常 [异常信息]: " + e, e);
        }
    }

    /**
     * Step 4: loads the external font file from a URL or from the local file system.
     */
    private PDType0Font loadFont(PDDocument document, String fontPath) {
        try {
            if (isRemote(fontPath)) {
                try (InputStream is = new URL(fontPath).openConnection().getInputStream()) {
                    return PDType0Font.load(document, is);
                }
            }
            return PDType0Font.load(document, new File(fontPath));
        } catch (IOException e) {
            throw new RuntimeException("加载外部字体文件异常 [异常信息]: " + e, e);
        }
    }

    /**
     * Step 6: saves the modified document, delegating remote destinations to the subclass.
     */
    private void saveDocument(PDDocument document, String destPdfPath) {
        try {
            if (isRemote(destPdfPath)) {
                saveOssPdf(document, destPdfPath);
            } else {
                document.save(destPdfPath);
            }
        } catch (IOException e) {
            throw new RuntimeException("文件保存异常 [异常信息]: " + e, e);
        }
    }

    /**
     * Saves the PDF to remote object storage (OSS).
     *
     * @param document    the PDF document to save
     * @param destPdfPath path of the generated PDF
     */
    public abstract void saveOssPdf(PDDocument document, String destPdfPath);

    /**
     * Replacement rule implemented by the subclass.
     *
     * @param keyWordMap        keyword data to replace; the key is the placeholder, the value is
     *                          the replacement text
     * @param keyWordEntityList located keyword entities
     * @param document          the PDF document being modified
     * @param font              font used to draw the replacement text
     */
    public abstract void replacementRule(Map<String, String> keyWordMap, List<KeyWordEntity> keyWordEntityList, PDDocument document, PDType0Font font);
}
测试实现类
package com.xrl;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import java.awt.*;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Paths;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * Sample implementation that covers the area between the "name" and "gender" labels and
 * writes the replacement text there.
 *
 * @version [v1.0]
 * @author: [xrl]
 * @create: [2024/03/07 11:03]
 * @Description: Test implementation class
 **/
public class PDFTextParsingAbstractImpl extends PDFTextParsingAbstract {
    /** Label that starts the region to cover. */
    private static final String NAME_KEY_WORD = "姓 名:";
    /** Label that ends the region to cover. */
    private static final String GENDER_KEY_WORD = "性 别:";

    /**
     * Saves the PDF into a local {@code temp} directory as a staging step for an upload.
     * The upload itself and the cleanup of the temporary file are not implemented here.
     *
     * @param document    the PDF document to save
     * @param destPdfPath URL of the generated PDF; its last path segment becomes the file name
     * @throws RuntimeException when the directory cannot be created or the file cannot be
     *                          written; the original failure is kept as the cause
     */
    @Override
    public void saveOssPdf(PDDocument document, String destPdfPath) {
        try {
            // 1. Create the local temporary directory.
            File tempDir = new File("temp");
            if (!tempDir.exists() && !tempDir.mkdirs()) {
                throw new RuntimeException("临时目录创建失败");
            }
            // 2. Build the file name from the last segment of the URL path.
            URL fileUrl = new URL(destPdfPath);
            String fileName = Paths.get(fileUrl.getPath()).getFileName().toString();
            fileName = tempDir.getAbsolutePath() + File.separator + fileName;
            // 3. Save the file; the stream is closed even when saving fails.
            try (FileOutputStream fos = new FileOutputStream(fileName)) {
                document.save(fos);
            }
            // 4. Upload to remote storage such as OSS (not implemented).
            // 5. Delete the temporary file (not implemented).
        } catch (Exception e) {
            throw new RuntimeException("[文件:" + destPdfPath + "] 获取失败", e);
        }
    }

    /**
     * Covers the area between each "name" label and its matching "gender" label with a blue
     * rectangle and draws the replacement text at the position of the "name" label.
     *
     * <p>Labels are paired by index; unpaired surplus occurrences are ignored, and nothing is
     * drawn when either label was not found at all.
     *
     * <p>NOTE(review): {@link KeyWordEntity} carries no page number, so every pair is drawn on
     * every page of the document. Confirm this is acceptable for multi-page templates.
     *
     * @param keyWordMap        keyword data to replace; the key is the placeholder, the value is
     *                          the replacement text
     * @param keyWordEntityList located keyword entities
     * @param document          the PDF document being modified
     * @param font              font used to draw the replacement text
     * @throws RuntimeException wrapping any {@link IOException} raised while drawing
     */
    @Override
    public void replacementRule(Map<String, String> keyWordMap, List<KeyWordEntity> keyWordEntityList, PDDocument document, PDType0Font font) {
        Map<String, List<KeyWordEntity>> entitiesByKeyWord = keyWordEntityList.stream()
                .collect(Collectors.groupingBy(KeyWordEntity::getKeyWord));
        List<KeyWordEntity> nameEntities = entitiesByKeyWord.get(NAME_KEY_WORD);
        List<KeyWordEntity> genderEntities = entitiesByKeyWord.get(GENDER_KEY_WORD);
        if (nameEntities == null || genderEntities == null) {
            // One of the two labels is missing, so no region can be delimited.
            return;
        }
        int pairCount = Math.min(nameEntities.size(), genderEntities.size());
        // A missing replacement value is drawn as empty text, which simply hides the content.
        String replacement = keyWordMap.get(NAME_KEY_WORD);
        if (replacement == null) {
            replacement = "";
        }
        for (int pageIndex = 0; pageIndex < document.getNumberOfPages(); pageIndex++) {
            // 1. Open an appending content stream; it is closed even when drawing fails.
            try (PDPageContentStream stream = new PDPageContentStream(document, document.getPage(pageIndex),
                    PDPageContentStream.AppendMode.APPEND, true)) {
                // 2. Replace the content next to every paired label.
                for (int i = 0; i < pairCount; i++) {
                    KeyWordEntity nameEntity = nameEntities.get(i);
                    KeyWordEntity genderEntity = genderEntities.get(i);
                    // Fill colour of the covering rectangle.
                    stream.setNonStrokingColor(Color.blue);
                    // The rectangle spans from the end of the name label to the start of the gender label.
                    float coverX = nameEntity.getX() + nameEntity.getWidth();
                    stream.addRect(coverX, nameEntity.getY(),
                            genderEntity.getX() - nameEntity.getX() - nameEntity.getWidth(), nameEntity.getHeight() + 1);
                    stream.fill();
                    // Draw the replacement text.
                    stream.beginText();
                    stream.setFont(font, 14);
                    stream.newLineAtOffset(nameEntity.getX(), nameEntity.getY());
                    stream.showText(replacement);
                    stream.endText();
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }
}
测试运行
package com.xrl;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
 * Manual test entry point: hides the content next to the name label of a local sample PDF.
 *
 * @version [v1.0]
 * @author: [xrl]
 * @create: [2024/03/07 11:04]
 **/
public class PDFMain {
    public static void main(String[] args) throws IOException {
        // Both placeholders map to an empty replacement, i.e. the covered content is hidden.
        Map<String, String> replacements = new HashMap<>(3);
        replacements.put("姓 名:", "");
        replacements.put("性 别:", "");

        // Remote variant, kept for reference (http/https URLs are accepted for all three paths):
        // pdfTextParsing.replaceText(keyWordMap,
        //         "https://******/20240307112342.pdf",
        //         "https://******/20240307112342.pdf",
        //         "https://******/源柔黑体.ttf");

        // NOTE(review): template and destination point at the same file, so the template is
        // overwritten in place; confirm that this is intended.
        final String templatePath = "E:/work_space_zw/pdf/pdf/4020.pdf";
        final String destinationPath = "E:\\work_space_zw\\pdf\\pdf\\4020.pdf";
        final String fontFilePath = "E:\\work_space_zw\\pdf\\pdf\\源柔黑体.ttf";

        PDFTextParsingAbstractImpl parser = new PDFTextParsingAbstractImpl();
        parser.replaceText(replacements, templatePath, destinationPath, fontFilePath);
    }
}
测试效果
运行前
运行后