java获取pdf文字坐标_Java 获取PDF关键字坐标

该博客介绍了如何使用iTextPDF库和PDFBox库在Java中获取PDF文件中关键字的坐标。通过解析PDF内容,可以获取到关键字在每一页的位置信息,包括X坐标、Y坐标、字体长度和高度。
摘要由CSDN通过智能技术生成

一、使用 itextpdf(推荐使用)

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13.1</version>
</dependency>

PdfKeyWordPosition.java

package com.util;

import com.itextpdf.awt.geom.Rectangle2D;

import com.itextpdf.text.pdf.PdfDictionary;

import com.itextpdf.text.pdf.PdfName;

import com.itextpdf.text.pdf.PdfReader;

import com.itextpdf.text.pdf.parser.*;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;

import java.io.FileInputStream;

import java.io.IOException;

import java.io.InputStream;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

/**

* 获取pdf关键字坐标

*/

public class PdfKeyWordPosition {

private static final Logger log = LoggerFactory.getLogger(PdfKeyWordPosition.class);

/**

* 获取关键字坐标

* @param pdfData

* @param keyWord

* @return

*/

public static List> getWordsPcoordinate(byte[] pdfData, String keyWord){

List> result = new ArrayList<>();

PdfReader reader = null;

try {

// pdfData :可以是二进制,也可以是文件路径,两种方式选择一种

reader = new PdfReader(pdfData);

//获取pdf页数

int pages = reader.getNumberOfPages();

for (int pageNum = 1; pageNum <= pages; pageNum++) {

//每页的宽度

Float width = reader.getPageSize(pageNum).getWidth();

//每页的高度

Float height = reader.getPageSize(pageNum).getHeight();

RenderListenerHelper renderListenerHelper = new RenderListenerHelper(pageNum, width, height);

//解析pdf,定位位置

PdfContentStreamProcessor processor = new PdfContentStreamProcessor(renderListenerHelper);

PdfDictionary pageDic = reader.getPageN(pageNum);

PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);

processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

//文本内容

String content = renderListenerHelper.getContent();

//文本每个字对应的坐标

List> charPositions = renderListenerHelper.getCharPositions();

for (int i = 0; i < content.length(); i++){

//获取关键字所在位置

int keyIndex = content.indexOf(keyWord, i);

if (keyIndex == -1){

break;

}

result.add(charPositions.get(keyIndex));

i = keyIndex + 1;

}

}

} catch (Exception e){

log.error("获取pdf关键字坐标失败:{}", e);

} finally {

reader.close();

}

return result;

}

/**

* 重写 itextpdf 的 RenderListener 类里的方法

*/

private static class RenderListenerHelper implements RenderListener {

private int pageNum;

private float pageWidth;

private float pageHeight;

private StringBuilder contentBuilder = new StringBuilder();

private List> charPositions = new ArrayList<>();

public RenderListenerHelper(int pageNum, float pageWidth, float pageHeight) {

this.pageNum = pageNum;

this.pageWidth = pageWidth;

this.pageHeight = pageHeight;

}

public String getContent() {

return contentBuilder.toString();

}

public List> getCharPositions() {

return charPositions;

}

//step 2 遇到"BT"执行

@Override

public void beginTextBlock() {

}

//step 3 文字主要处理方法

@Override

public void renderText(TextRenderInfo renderInfo) {

//获取文本内容每个字信息集合

List characterRenderInfos = renderInfo.getCharacterRenderInfos();

for (TextRenderInfo textRenderInfo : characterRenderInfos) {

String word = textRenderInfo.getText();

if (word.length() > 1) {

word = word.substring(word.length() - 1);

}

//关键字上边缘坐标

//Rectangle2D.Float boundingRectange = textRenderInfo.getAscentLine().getBoundingRectange();

//关键字标准坐标(中间)

Rectangle2D.Float boundingRectange = textRenderInfo.getBaseline().getBoundingRectange();

//关键字下边缘坐标

//Rectangle2D.Float boundingRectange = textRenderInfo.getDescentLine().getBoundingRectange();

//正常坐标

Float x = boundingRectange.x;

Float y = boundingRectange.y;

/*

//中心坐标

float x = (float)boundingRectange.getCenterX();

float y = (float)boundingRectange.getCenterY();

//最大最小坐标

double x = boundingRectange.getMinX();

double y = boundingRectange.getMaxY();

//这两个是关键字在所在页面的XY轴的百分比

float xPercent = Math.round(x / pageWidth * 10000) / 10000f;

float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;

*/

Map coordinate = new HashMap<>();

coordinate.put("x", x);

coordinate.put("y", y);

coordinate.put("pageNum", pageNum); //页数

coordinate.put("fontWidth", boundingRectange.width); //字体长度

coordinate.put("fontHeight", boundingRectange.height); //字段高度

charPositions.add(coordinate);

contentBuilder.append(word);

}

}

//step 4(最后执行的,只执行一次),遇到“ET”执行

@Override

public void endTextBlock() {

}

//step 1(图片处理方法)

@Override

public void renderImage(ImageRenderInfo renderInfo) {

}

}

public static void main(String[] args) {

try {

InputStream is = null;

ByteArrayOutputStream bos = new ByteArrayOutputStream();

try {

is = new FileInputStream("D:\\test.pdf");

byte[] buffer = new byte[is.available()];

Integer n = 0;

while ((n = is.read(buffer)) != -1) {

bos.write(buffer, 0, n);

}

} catch (IOException e) {

e.printStackTrace();

} finally {

try {

bos.close();

if (is != null) {

is.close();

}

} catch (IOException e) {

e.printStackTrace();

}

}

byte[] bytes = bos.toByteArray();

List> wordsPcoordinates = getWordsPcoordinate(bytes,"日期");

for (Map map : wordsPcoordinates){

System.out.println("x坐标 -> " + map.get("x"));

System.out.println("y坐标 -> " + map.get("y"));

System.out.println("页数 -> " + map.get("pageNum"));

System.out.println("字体长度 -> " + map.get("fontWidth"));

System.out.println("字段高度 -> " + map.get("fontHeight"));

System.out.println("");

}

} catch (Exception e) {

e.printStackTrace();

}

}

}

二、使用 pdfbox

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.20</version>
</dependency>

PdfBoxKeyWordPosition.java

package com.util;

import org.apache.pdfbox.pdmodel.PDDocument;

import org.apache.pdfbox.text.PDFTextStripper;

import org.apache.pdfbox.text.TextPosition;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.io.*;

import java.util.ArrayList;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

/**

* 继承 pdfbox 中 PDFTextStripper类,获取关键字坐标

*/

public class PdfBoxKeyWordPosition extends PDFTextStripper {

private static final Logger log = LoggerFactory.getLogger(PdfBoxKeyWordPosition.class);

//关键字字符数组

private char[] key;

//PDF文件路径

private String pdfPath;

//二进制文件

private byte[] bytes;

//坐标集合

private List> coordinates = new ArrayList<>();

// 当前页坐标集合

private List> pageList = new ArrayList<>();

/*//使用文件路径

public PdfBoxKeyWordPosition(String keyWords, String pdfPath) throws IOException {

super();

super.setSortByPosition(true);

this.pdfPath = pdfPath;

char[] key = new char[keyWords.length()];

for (int i = 0; i < keyWords.length(); i++) {

key[i] = keyWords.charAt(i);

}

this.key = key;

}*/

//使用二进制数据

public PdfBoxKeyWordPosition(String keyWords, byte[] bytes) throws IOException {

super();

super.setSortByPosition(true);

this.bytes = bytes;

char[] key = new char[keyWords.length()];

for (int i = 0; i < keyWords.length(); i++) {

key[i] = keyWords.charAt(i);

}

this.key = key;

}

// 获取坐标信息

public List> getCoordinate(){

try {

//document = PDDocument.load(new File(pdfPath)); 文件地址

document = PDDocument.load(bytes); //文件二进制数据

int pages = document.getNumberOfPages();

for (int i = 1; i <= pages; i++) {

super.setSortByPosition(true);

super.setStartPage(i);

super.setEndPage(i);

Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());

super.writeText(document, dummy);

for (Map li : pageList) {

li.put("pageNum", i);

}

coordinates.addAll(pageList);

pageList.clear();

}

} catch (Exception e) {

log.error("获取pdf关键字坐标失败:{}", e);

} finally {

pageList.clear();

try {

if (document != null) {

document.close();

}

} catch (IOException e) {

log.error("关闭文件失败:{}", e);

}

}

return coordinates;

}

// 获取坐标信息

@Override

protected void writeString(String string, List textPositions) throws IOException {

for (int i = 0; i < textPositions.size(); i++) {

String str = textPositions.get(i).getUnicode();

//找到 key 中第一位所在位置

if (str.equals(String.valueOf(key[0]))) {

int count = 0;

for (int j = 0; j < key.length; j++) {

String s = "";

try {

s = textPositions.get(i + j).getUnicode();

} catch (Exception e) {

s = "";

}

//判断key 中每一位是否和文本中顺序对应,一旦不等说明 关键字与本段落不等,则停止本次循环

if (s.equals(String.valueOf(key[j]))) {

count++;

} else if (count > 0){

break;

}

}

//判断 key 中字 在文本是否连续,是则获取坐标

if (count == key.length) {

Map coordinate = new HashMap<>();

TextPosition tp = textPositions.get(i);

// X坐标 在这里加上了字体的长度,也可以直接 tp.getX()

Float x = tp.getX() + tp.getFontSize();

// Y坐标 在这里减去的字体的长度,也可以直接 tp.getPageHeight() - tp.getY()

Float y = tp.getPageHeight() - tp.getY() - 4 * tp.getFontSize();

coordinate.put("x", x);

coordinate.put("y", y);

pageList.add(coordinate);

}

}

}

}

public static void main(String[] args) {

try {

InputStream is = null;

ByteArrayOutputStream bos = new ByteArrayOutputStream();

try {

is = new FileInputStream("D:\\test.pdf");

byte[] buffer = new byte[is.available()];

Integer n = 0;

while ((n = is.read(buffer)) != -1) {

bos.write(buffer, 0, n);

}

} catch (IOException e) {

e.printStackTrace();

} finally {

try {

bos.close();

if (is != null) {

is.close();

}

} catch (IOException e) {

e.printStackTrace();

}

}

byte[] bytes = bos.toByteArray();

PdfBoxKeyWordPosition pdf = new PdfBoxKeyWordPosition("日期", bytes);

List> wordsPcoordinates = pdf.getCoordinate();

for (Map map : wordsPcoordinates){

System.out.println("x坐标 -> " + map.get("x"));

System.out.println("y坐标 -> " + map.get("y"));

System.out.println("页面 -> " + map.get("pageNum"));

System.out.println("");

}

} catch (Exception e) {

e.printStackTrace();

}

}

}

获取 PDF 关键字坐标,可以使用 Apache PDFBox 库。以下是获取 PDF 关键字坐标的示例代码:

```java
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup.MarkupTypeEnum;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationTextMarkup.QuadPoints;

public class PDFKeywordCoordinates {
    public static void main(String[] args) throws IOException {
        String filePath = "example.pdf";
        String keyword = "example";
        PDDocument document = PDDocument.load(new File(filePath));
        List<PDPage> pages = document.getDocumentCatalog().getAllPages();
        for (PDPage page : pages) {
            List<PDAnnotation> annotations = page.getAnnotations();
            for (PDAnnotation annotation : annotations) {
                if (annotation instanceof PDAnnotationTextMarkup) {
                    PDAnnotationTextMarkup markup = (PDAnnotationTextMarkup) annotation;
                    if (markup.getMarkupType() == MarkupTypeEnum.HIGHLIGHT) {
                        COSDictionary dict = markup.getCOSObject();
                        COSArray quadPointsArray = (COSArray) dict.getDictionaryObject(COSName.getPDFName("QuadPoints"));
                        for (int i = 0; i < quadPointsArray.size(); i += 8) {
                            float x1 = quadPointsArray.getFloat(i);
                            float y1 = quadPointsArray.getFloat(i + 1);
                            float x2 = quadPointsArray.getFloat(i + 2);
                            float y2 = quadPointsArray.getFloat(i + 3);
                            float x3 = quadPointsArray.getFloat(i + 4);
                            float y3 = quadPointsArray.getFloat(i + 5);
                            float x4 = quadPointsArray.getFloat(i + 6);
                            float y4 = quadPointsArray.getFloat(i + 7);
                            if (containsKeyword(page, keyword, x1, y1, x2, y2, x3, y3, x4, y4)) {
                                System.out.println("Keyword '" + keyword + "' found on page " + (pages.indexOf(page) + 1) + " at (" + x1 + "," + y1 + ") (" + x2 + "," + y2 + ") (" + x3 + "," + y3 + ") (" + x4 + "," + y4 + ")");
                            }
                        }
                    }
                }
            }
        }
        document.close();
    }

    private static boolean containsKeyword(PDPage page, String keyword, float x1, float y1, float x2, float y2, float x3, float y3, float x4, float y4) throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append("q\n");
        sb.append(x1).append(' ').append(y1).append(" m\n");
        sb.append(x2).append(' ').append(y2).append(" l\n");
        sb.append(x3).append(' ').append(y3).append(" l\n");
        sb.append(x4).append(' ').append(y4).append(" l\n");
        sb.append("h\n");
        sb.append("W* n\n");
        sb.append("BT\n");
        sb.append("/Helv 12 Tf\n");
        sb.append("0 g\n");
        sb.append("1 0 0 1 ").append(x1).append(' ').append(y1).append(" Tm\n");
        sb.append("(").append(keyword).append(") Tj\n");
        sb.append("ET\n");
        sb.append("Q\n");
        return page.getContents().stream().anyMatch(content -> content.getString().contains(sb.toString()));
    }
}
```

在此示例代码中,我们首先加载 PDF 文件并获取所有页面。然后,我们遍历每个页面的所有注释,并查找类型为“高亮”的注释。对于每个高亮注释,我们获取该注释的四个顶点坐标,并检查关键字是否包含在其中。如果找到匹配项,则打印关键字坐标。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值