目录
1. maven配置
<!-- Hutool utility bundle; the Java code below imports its JSONUtil and CollectionUtil classes -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.11</version>
</dependency>
<!-- iText 7 core, declared with type "pom"; the Java code below uses its PDF reading and text-location classes -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itext7-core</artifactId>
<version>7.2.0</version>
<type>pom</type>
</dependency>
2. 实体类
package com.example.demo.itext.model;
import lombok.Data;
import java.io.Serializable;
@Data
public class KeyWordBean implements Comparable<KeyWordBean>, Serializable {
private float x;
private float y;
private float width;
private float height;
// pdf的页面
private int page;
// 当前页面中第几个
private int num;
private String text;
@Override
public int compareTo(KeyWordBean o) {
// 先按照Y轴排序
int i = (int) (o.getY() - this.getY());
if (i == 0) {
// 如果Y轴相等了再按X轴进行排序
return (int) (this.x - o.getX());
}
return i;
}
}
3. java代码
package com.example.demo.itext.util;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.json.JSONUtil;
import com.example.demo.itext.model.KeyWordBean;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfCanvasProcessor;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IPdfTextLocation;
import com.itextpdf.kernel.pdf.canvas.parser.listener.RegexBasedLocationExtractionStrategy;
import java.io.IOException;
import java.util.*;
/**
 * Helpers for locating keyword occurrences inside a PDF with iText 7.
 */
public class ItextPDFUtil {

    /**
     * Demo entry point: searches a hard-coded PDF for a hard-coded keyword and
     * prints the resulting location map as JSON.
     *
     * @param args ignored
     * @throws IOException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException {
        String path = "F:\\software\\myfile\\txt12_加水印.pdf";
        System.out.println("关键字在PDF文件中的文字信息:" + JSONUtil.toJsonStr(keyWordLocationMap("负责人签名:", path)));
    }

    /**
     * Finds the coordinates of every occurrence of a keyword in a PDF.
     *
     * <p>The keyword is handed to {@link RegexBasedLocationExtractionStrategy}
     * unchanged, so it is interpreted as a regular expression.
     *
     * @param KEY_WORD keyword (regular expression) to search for
     * @param input    path of the PDF file
     * @return map from 1-based page number to the matches on that page; pages
     *         without a match have no entry
     * @throws NullPointerException if {@code KEY_WORD} or {@code input} is {@code null}
     * @throws RuntimeException     wrapping the {@link IOException} if the file cannot be read
     */
    public static Map<Integer, List<KeyWordBean>> keyWordLocationMap(String KEY_WORD, String input) {
        Objects.requireNonNull(KEY_WORD, "KEY_WORD");
        Objects.requireNonNull(input, "input");
        // try-with-resources closes the document exactly once on every path.
        try (PdfDocument pdfDocument = new PdfDocument(new PdfReader(input))) {
            int pageCount = pdfDocument.getNumberOfPages();
            Map<Integer, List<KeyWordBean>> listMap = new HashMap<>(pageCount);
            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
                List<KeyWordBean> matches = findOnPage(pdfDocument.getPage(pageNumber), pageNumber, KEY_WORD);
                if (!matches.isEmpty()) {
                    listMap.put(pageNumber, matches);
                }
            }
            return listMap;
        } catch (IOException e) {
            throw new RuntimeException("Failed to read PDF: " + input, e);
        }
    }

    /** Runs the keyword search on a single page and converts each hit to a {@link KeyWordBean}. */
    private static List<KeyWordBean> findOnPage(PdfPage page, int pageNumber, String keyWord) {
        // A fresh strategy per page keeps the results scoped to that page.
        RegexBasedLocationExtractionStrategy strategy = new RegexBasedLocationExtractionStrategy(keyWord);
        new PdfCanvasProcessor(strategy).processPageContent(page);

        List<KeyWordBean> beans = new ArrayList<>();
        int ordinal = 0;
        for (IPdfTextLocation location : strategy.getResultantLocations()) {
            ordinal++;
            Rectangle bounds = location.getRectangle();
            System.out.println("关键字“" + keyWord + "” 的坐标为 x: " + bounds.getX() + " ,y: " + bounds.getY());
            beans.add(toKeyWordBean(location, pageNumber, ordinal));
        }
        return beans;
    }

    /** Copies one text location into a {@link KeyWordBean}. */
    private static KeyWordBean toKeyWordBean(IPdfTextLocation location, int pageNumber, int ordinal) {
        Rectangle bounds = location.getRectangle();
        KeyWordBean bean = new KeyWordBean();
        // Use the page actually being scanned rather than location.getPageNumber();
        // NOTE(review): the regex strategy does not appear to populate that value
        // (it seems to report 0) - confirm against the iText version in use.
        bean.setPage(pageNumber);
        bean.setX(bounds.getX());
        bean.setY(bounds.getY());
        bean.setWidth(bounds.getWidth());
        bean.setHeight(bounds.getHeight());
        bean.setText(location.getText());
        bean.setNum(ordinal);
        return bean;
    }
}