java实现查找PDF关键字所在页码及其坐标

根据关键字返回在pdf中的xy坐标和页码

maven引用:

<dependency>
            <groupId>com.itextpdf</groupId>
            <artifactId>itextpdf</artifactId>
            <version>5.5.1</version>
</dependency>

import com.itextpdf.awt.geom.Rectangle2D;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


public class PdfUtils {

    /**
     * Finds every occurrence of a keyword in a PDF document.
     *
     * <p>Each page is parsed into its text content plus per-character positions,
     * and the keyword is then searched page by page.
     *
     * @param pdfData the complete PDF file content as a byte array
     * @param keyword the text to look for
     * @return one {@code float[]} per match, laid out as {@code [0]} page number,
     *         {@code [1]} x, {@code [2]} y of the first character of the match;
     *         an empty list when nothing matches
     * @throws IOException if the PDF data cannot be read or parsed
     */
    public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
        List<float[]> matches = new ArrayList<>();
        for (PdfUtils.PdfPageContentPositions page : getPdfContentPostionsList(pdfData)) {
            List<float[]> pageMatches = findPositions(keyword, page);
            // Only pages that actually produced hits contribute to the result.
            if (pageMatches != null && !pageMatches.isEmpty()) {
                matches.addAll(pageMatches);
            }
        }
        return matches;
    }


    /**
     * Parses every page of the PDF and collects, per page, the extracted text
     * together with one position entry per extracted character.
     *
     * @param pdfData the complete PDF file content as a byte array
     * @return one entry per page, in page order; each position is laid out as
     *         {@code [0]} page number, {@code [1]} x, {@code [2]} y
     * @throws IOException if the PDF data cannot be read or a page's content
     *                     stream cannot be processed
     */
    private static List<PdfUtils.PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
        PdfReader reader = new PdfReader(pdfData);
        // try/finally so the reader is released on every exit path, including
        // unchecked exceptions thrown while processing a malformed content stream.
        try {
            List<PdfUtils.PdfPageContentPositions> result = new ArrayList<>();

            int pages = reader.getNumberOfPages();
            for (int pageNum = 1; pageNum <= pages; pageNum++) {
                float width = reader.getPageSize(pageNum).getWidth();
                float height = reader.getPageSize(pageNum).getHeight();

                PdfUtils.PdfRenderListener pdfRenderListener = new PdfUtils.PdfRenderListener(pageNum, width, height);

                // Run the page's content stream through the listener to capture text and positions.
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
                PdfDictionary pageDic = reader.getPageN(pageNum);
                PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

                // Flatten the listener's character positions into [page, x, y] arrays.
                List<float[]> positionsList = new ArrayList<>();
                for (PdfUtils.CharPosition charPosition : pdfRenderListener.getcharPositions()) {
                    positionsList.add(new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()});
                }

                PdfUtils.PdfPageContentPositions pdfPageContentPositions = new PdfUtils.PdfPageContentPositions();
                pdfPageContentPositions.setContent(pdfRenderListener.getContent());
                pdfPageContentPositions.setPostions(positionsList);

                result.add(pdfPageContentPositions);
            }
            return result;
        } finally {
            reader.close();
        }
    }


    /**
     * Searches one page's text for the keyword and returns the position of the
     * first character of every occurrence. Overlapping occurrences are all
     * reported, because the search resumes one character after each match.
     *
     * @param keyword                 the text to look for; an empty keyword yields no matches
     * @param pdfPageContentPositions the page's text and its per-character positions
     * @return the {@code [page, x, y]} entry for each match; never {@code null}
     */
    private static List<float[]> findPositions(String keyword, PdfUtils.PdfPageContentPositions pdfPageContentPositions) {
        List<float[]> result = new ArrayList<>();

        // String.indexOf("") succeeds at every index, so an empty keyword would
        // otherwise report every character on the page as a match.
        if (keyword != null && keyword.isEmpty()) {
            return result;
        }

        String content = pdfPageContentPositions.getContent();
        List<float[]> charPositions = pdfPageContentPositions.getPositions();
        if (content == null || charPositions == null) {
            return result;
        }

        int searchFrom = 0;
        while (searchFrom < content.length()) {
            int matchIndex = content.indexOf(keyword, searchFrom);
            if (matchIndex == -1) {
                break;
            }
            // Defensive: the lookup assumes one position entry per content character.
            // If the list is shorter, stop rather than throw IndexOutOfBoundsException.
            if (matchIndex >= charPositions.size()) {
                break;
            }
            result.add(charPositions.get(matchIndex));
            searchFrom = matchIndex + 1;
        }
        return result;
    }


    /**
     * Holder for one page's extracted text and the matching list of
     * per-character positions.
     */
    private static class PdfPageContentPositions {
        /** Text extracted from the page. */
        private String content;
        /** Position entries associated with the page's text. */
        private List<float[]> positions;

        /** @return the text extracted from the page */
        public String getContent() {
            return this.content;
        }

        /** @return the position entries for this page */
        public List<float[]> getPositions() {
            return this.positions;
        }

        /** @param content the text extracted from the page */
        public void setContent(String content) {
            this.content = content;
        }

        /**
         * Stores the position entries for this page. The method name keeps its
         * original spelling because existing code calls it.
         *
         * @param positions the position entries for this page
         */
        public void setPostions(List<float[]> positions) {
            this.positions = positions;
        }
    }



    private static class PdfRenderListener implements RenderListener {
        private int pageNum;
        private float pageWidth;
        private float pageHeight;
        private StringBuilder contentBuilder = new StringBuilder();
        private List<PdfUtils.CharPosition> charPositions = new ArrayList<>();


        public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }


        public void beginTextBlock() {
        }


        public void renderText(TextRenderInfo renderInfo) {
            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
            for (TextRenderInfo textRenderInfo : characterRenderInfos) {
                String word = textRenderInfo.getText();
                if (word.length() > 1) {
                    word = word.substring(word.length() - 1, word.length());
                }
                Rectangle2D.Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();

                float x = (float)rectangle.getX();
                float y = (float)rectangle.getY();
//                float x = (float)rectangle.getCenterX();
//                float y = (float)rectangle.getCenterY();
//                double x = rectangle.getMinX();
//                double y = rectangle.getMaxY();




                //这两个是关键字在所在页面的XY轴的百分比
                float xPercent = Math.round(x / pageWidth * 10000) / 10000f;
                float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;


//                CharPosition charPosition = new CharPosition(pageNum, xPercent, yPercent);
                PdfUtils.CharPosition charPosition = new PdfUtils.CharPosition(pageNum, (float)x, (float)y);
                charPositions.add(charPosition);
                contentBuilder.append(word);
            }
        }


        public void endTextBlock() {
        }


        public void renderImage(ImageRenderInfo renderInfo) {
        }


        public String getContent() {
            return contentBuilder.toString();
        }


        public List<PdfUtils.CharPosition> getcharPositions() {
            return charPositions;
        }
    }


    /**
     * Immutable record of where a single character was rendered: the page it
     * belongs to and its x / y coordinates.
     */
    private static class CharPosition {
        private final int pageNum;
        private final float x;
        private final float y;

        /**
         * @param pageNum page number the character is on
         * @param x       x coordinate of the character
         * @param y       y coordinate of the character
         */
        public CharPosition(int pageNum, float x, float y) {
            this.pageNum = pageNum;
            this.x = x;
            this.y = y;
        }

        /** @return the page number the character is on */
        public int getPageNum() {
            return this.pageNum;
        }

        /** @return the x coordinate of the character */
        public float getX() {
            return this.x;
        }

        /** @return the y coordinate of the character */
        public float getY() {
            return this.y;
        }
    }


    /**
     * Demo entry point: loads a PDF from a fixed path, searches it for a fixed
     * keyword and prints the page number and coordinates of every match.
     *
     * @param args unused
     * @throws IOException if the file cannot be read completely or the PDF cannot be parsed
     */
    public static void main(String[] args) throws IOException {
        // 1. The PDF file to search.
        File pdfFile = new File("D://word//rgxy.pdf");
        // 2. Buffer sized to the file's length.
        byte[] pdfData = new byte[(int) pdfFile.length()];

        // 3. Read the whole file. A single read() may deliver fewer bytes than
        //    requested, so keep reading until the buffer is full.
        try (FileInputStream inputStream = new FileInputStream(pdfFile)) {
            int offset = 0;
            while (offset < pdfData.length) {
                int count = inputStream.read(pdfData, offset, pdfData.length - offset);
                if (count < 0) {
                    throw new IOException("Unexpected end of file after " + offset + " bytes: " + pdfFile);
                }
                offset += count;
            }
        }

        // 4. The keyword to look for.
        String keyword = "签名:";

        // 5. Run the search.
        List<float[]> positions = findKeywordPostions(pdfData, keyword);

        // 6. Each element is one match: [0] page number, [1] x, [2] y.
        System.out.println("total:" + positions.size());
        for (float[] position : positions) {
            System.out.print("pageNum: " + (int) position[0]);
            System.out.print("\tx: " + position[1]);
            System.out.println("\ty: " + position[2]);
        }
    }
    
}

转自:https://www.cnblogs.com/xsdty/p/11463174.html

  • 1
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
要在Java中实现PDF点击链接跳转到指定页码,可以使用iText库。iText是一个开源的Java PDF库,可以用来创建、编辑和操作PDF文档。

下面是一个示例代码,展示如何在PDF文档中创建一个跳转链接,点击链接后跳转到第5页:

```java
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfDestination;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.action.PdfAction;
import com.itextpdf.kernel.pdf.navigation.PdfExplicitDestination;

// 打开PDF文档
PdfDocument pdfDoc = new PdfDocument(new PdfReader("example.pdf"), new PdfWriter("output.pdf"));
// 获取第5页
PdfPage page = pdfDoc.getPage(4);
// 创建PdfDestination对象
PdfDestination dest = PdfExplicitDestination.createFitH(page, page.getPageSize().getHeight());
// 创建PdfAction对象,指向PdfDestination
PdfAction action = PdfAction.createGoTo(dest);
// 在文本中创建超链接,指向PdfAction对象
Paragraph p = new Paragraph("Click here to go to page 5");
Link link = new Link(" ", action);
link.setUnderline(false);
p.add(link);
// 将文本添加到PDF文档中
Document doc = new Document(pdfDoc);
doc.add(p);
// 关闭文档
doc.close();
```

在上述代码中,首先打开了一个名为example.pdf的PDF文档,并获取了第5页的页面对象。然后,使用PdfExplicitDestination类创建了一个指向第5页的PdfDestination对象dest。接着,使用PdfAction类创建了一个指向dest对象的PdfAction对象action。最后,在文本中创建了一个超链接,指向action对象。

需要注意的是,创建PdfExplicitDestination对象时,需要指定目标页面和定位类型。定位类型可以是createFitH、createFitV、createFitR等等,可以根据需要选择不同的定位类型。如果需要指定更详细的目标位置,可以在定位类型后面添加参数,如createFitH(page, 300)表示距离页面顶部300个单位的位置。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值