java 获取pdf的坐标_获取 pdf 关键字坐标

本文介绍了一个 Java 工具类 `PdfPositionTool`,用于从 PDF 文件中获取特定关键字的坐标。通过解析 PDF 内容,可以获取关键字在每一页出现位置的页码以及 X、Y 坐标,以便进行后续处理。注意:返回的坐标并非 PDF 原始坐标(单位 pt、原点在左下角),而是按 25.4/72 的比值换算后的 OFD 坐标(原点在左上角),该换算存在一定误差。主要使用 iText 库来解析 PDF。
摘要由CSDN通过智能技术生成

package Demo.qd;

import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;

/**
 * Locates every occurrence of a keyword in a PDF and reports where it starts.
 *
 * <p>Each hit is a {@code double[3]}: {@code [0]} page number (1-based),
 * {@code [1]} x, {@code [2]} y. The x/y values are not raw PDF coordinates:
 * they are converted to OFD coordinates (factor 25.4 / 72, origin moved from
 * bottom-left to top-left). Parsing is done with iText.
 */
public class PdfPositionTool {

    /** Demo entry point: prints every position of a sample keyword in a sample file. */
    public static void main(String[] args) throws Exception {
        PdfPositionTool pdfPositionTool = new PdfPositionTool();
        List<double[]> positions =
                pdfPositionTool.getPositions("/Users/yourouniu/Desktop/111.pdf", "%盖章处%");
        if (positions != null && positions.size() > 0) {
            for (double[] position : positions) {
                System.out.println("pageNum: " + (int) position[0]);
                System.out.println("x: " + position[1]);
                System.out.println("y: " + position[2]);
            }
        }
    }

    /**
     * Finds the coordinates of a keyword in a PDF file.
     *
     * @param filePath path of the PDF file
     * @param keyword  keyword to search for
     * @return list of coordinate arrays: {@code [0]} page number, {@code [1]} x, {@code [2]} y;
     *         empty when the keyword does not occur
     * @throws IOException if the file cannot be read or the PDF cannot be parsed
     */
    public List<double[]> getPositions(String filePath, String keyword) throws IOException {
        File pdfFile = new File(filePath);
        // Files.readAllBytes reads the whole file; a single InputStream.read(byte[])
        // call is allowed to return before the buffer is full.
        byte[] pdfData = Files.readAllBytes(pdfFile.toPath());
        return findKeywordPostions(pdfData, keyword);
    }

    /**
     * Converts a PDF coordinate to an OFD coordinate using the ratio 25.4 / 72.
     * The conversion is approximate; computing the position as a percentage of
     * the distance from the origin is the more accurate approach.
     */
    private static double transForPosition(double pdfPosition) {
        return pdfPosition * 25.4 / 72;
    }

    /**
     * Finds the coordinates of a keyword in PDF data that is already in memory.
     *
     * @param pdfData content of the PDF file as a byte array
     * @param keyword keyword to search for
     * @return list of coordinate arrays: {@code [0]} page number, {@code [1]} x, {@code [2]} y
     * @throws IOException if the PDF cannot be parsed
     */
    public List<double[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
        List<double[]> result = new ArrayList<>();
        for (PdfPageContentPositions page : getPdfContentPostionsList(pdfData)) {
            result.addAll(findPositions(keyword, page));
        }
        return result;
    }

    /**
     * Extracts, for every page, the text content together with one position per
     * character of that content.
     */
    private List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
        PdfReader reader = new PdfReader(pdfData);
        try {
            List<PdfPageContentPositions> result = new ArrayList<>();
            int pages = reader.getNumberOfPages();
            for (int pageNum = 1; pageNum <= pages; pageNum++) {
                float width = reader.getPageSize(pageNum).getWidth();
                float height = reader.getPageSize(pageNum).getHeight();

                // Parse the page content stream; the listener records text and positions.
                PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
                PdfDictionary pageDic = reader.getPageN(pageNum);
                PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

                List<double[]> positionsList = new ArrayList<>();
                for (CharPosition charPosition : pdfRenderListener.getcharPositions()) {
                    positionsList.add(new double[] {
                            charPosition.getPageNum(), charPosition.getX(), charPosition.getY()});
                }

                PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
                pdfPageContentPositions.setContent(pdfRenderListener.getContent());
                pdfPageContentPositions.setPostions(positionsList);
                result.add(pdfPageContentPositions);
            }
            return result;
        } finally {
            // Close on every exit path, including unchecked exceptions thrown while parsing.
            reader.close();
        }
    }

    /**
     * Returns the position of the first character of every occurrence of
     * {@code keyword} in the page content. Overlapping occurrences are reported,
     * because the scan resumes one character after each hit.
     */
    private static List<double[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {
        List<double[]> result = new ArrayList<>();
        String content = pdfPageContentPositions.getContent();
        List<double[]> charPositions = pdfPageContentPositions.getPositions();
        if (keyword == null || keyword.isEmpty() || content == null || charPositions == null) {
            return result;
        }
        for (int pos = 0; pos < content.length(); ) {
            int positionIndex = content.indexOf(keyword, pos);
            if (positionIndex == -1 || positionIndex >= charPositions.size()) {
                break;
            }
            double[] postions = charPositions.get(positionIndex);
            result.add(postions);
            pos = positionIndex + 1;
        }
        return result;
    }

    /** Text of one page plus one position per character of that text (index-aligned). */
    private static class PdfPageContentPositions {
        private String content;
        private List<double[]> positions;

        public String getContent() {
            return content;
        }

        public void setContent(String content) {
            this.content = content;
        }

        public List<double[]> getPositions() {
            return positions;
        }

        public void setPostions(List<double[]> positions) {
            this.positions = positions;
        }
    }

    /** Render listener that collects a page's characters and their converted positions. */
    private static class PdfRenderListener implements RenderListener {
        private final int pageNum;
        // Not read by the current conversion; kept because the constructor receives it.
        private final float pageWidth;
        private final float pageHeight;
        private final StringBuilder contentBuilder = new StringBuilder();
        private final List<CharPosition> charPositions = new ArrayList<>();

        public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }

        @Override
        public void beginTextBlock() {
        }

        /**
         * Records one character and one converted OFD position per glyph.
         * If needed this could instead compute the percentage distance from the
         * origin; when the OFD page size is known, recomputing from percentages
         * is more precise.
         */
        @Override
        public void renderText(TextRenderInfo renderInfo) {
            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
            for (TextRenderInfo textRenderInfo : characterRenderInfos) {
                String word = textRenderInfo.getText();
                if (word == null || word.isEmpty()) {
                    // Nothing would be appended to the content, so adding a position
                    // here would shift every later content-index -> position lookup.
                    continue;
                }
                if (word.length() > 1) {
                    // Keep exactly one char per glyph so content indices match positions.
                    word = word.substring(word.length() - 1);
                }

                // Float here is com.itextpdf.awt.geom.Rectangle2D.Float (see imports).
                Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();
                float x = (float) rectangle.getX();
                float y = (float) rectangle.getY();

                // PDF origin is bottom-left, OFD origin is top-left: flip y via its
                // page-height fraction (rounded to 4 decimals).
                float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;

                CharPosition charPosition = new CharPosition(pageNum, transForPosition(x),
                        transForPosition(yPercent * pageHeight));
                charPositions.add(charPosition);
                contentBuilder.append(word);
            }
        }

        @Override
        public void endTextBlock() {
        }

        @Override
        public void renderImage(ImageRenderInfo renderInfo) {
        }

        public String getContent() {
            return contentBuilder.toString();
        }

        public List<CharPosition> getcharPositions() {
            return charPositions;
        }
    }

    /** Page number and converted x/y of a single character. */
    private static class CharPosition {
        private final int pageNum;
        private final double x;
        private final double y;

        public CharPosition(int pageNum, double x, double y) {
            this.pageNum = pageNum;
            this.x = x;
            this.y = y;
        }

        public int getPageNum() {
            return pageNum;
        }

        public double getX() {
            return x;
        }

        public double getY() {
            return y;
        }

        @Override
        public String toString() {
            return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";
        }
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值