一、使用 itextpdf(推荐)
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13.1</version>
</dependency>
PdfKeyWordPosition.java
package com.util;
import com.itextpdf.awt.geom.Rectangle2D;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* 获取pdf关键字坐标
*/
public class PdfKeyWordPosition {
private static final Logger log = LoggerFactory.getLogger(PdfKeyWordPosition.class);
/**
* 获取关键字坐标
* @param pdfData
* @param keyWord
* @return
*/
public static List> getWordsPcoordinate(byte[] pdfData, String keyWord){
List> result = new ArrayList<>();
PdfReader reader = null;
try {
// pdfData :可以是二进制,也可以是文件路径,两种方式选择一种
reader = new PdfReader(pdfData);
//获取pdf页数
int pages = reader.getNumberOfPages();
for (int pageNum = 1; pageNum <= pages; pageNum++) {
//每页的宽度
Float width = reader.getPageSize(pageNum).getWidth();
//每页的高度
Float height = reader.getPageSize(pageNum).getHeight();
RenderListenerHelper renderListenerHelper = new RenderListenerHelper(pageNum, width, height);
//解析pdf,定位位置
PdfContentStreamProcessor processor = new PdfContentStreamProcessor(renderListenerHelper);
PdfDictionary pageDic = reader.getPageN(pageNum);
PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
//文本内容
String content = renderListenerHelper.getContent();
//文本每个字对应的坐标
List> charPositions = renderListenerHelper.getCharPositions();
for (int i = 0; i < content.length(); i++){
//获取关键字所在位置
int keyIndex = content.indexOf(keyWord, i);
if (keyIndex == -1){
break;
}
result.add(charPositions.get(keyIndex));
i = key