使用 PDFBox 获取 PDF 中指定文本附近的图片（可按距离筛选）

qq_64988157

已于 2024-09-03 15:37:34 修改

阅读量286

点赞数 8

文章标签： pdf java windows wps

于 2024-09-03 14:25:52 首次发布

本文链接：https://blog.csdn.net/qq_64988157/article/details/141860758

版权

引入依赖（在 pom.xml 中添加以下 Maven 依赖）

      <dependency>
           <groupId>org.apache.pdfbox</groupId>
           <artifactId>pdfbox</artifactId>
           <version>2.0.29</version>
       </dependency>
       <dependency>
           <groupId>com.github.jai-imageio</groupId>
           <artifactId>jai-imageio-core</artifactId>
           <version>1.4.0</version>
       </dependency>
       <dependency>
           <groupId>com.github.jai-imageio</groupId>
           <artifactId>jai-imageio-jpeg2000</artifactId>
           <version>1.3.0</version>
       </dependency>
       <!-- https://mvnrepository.com/artifact/net.sf.cssbox/pdf2dom -->
       <dependency>
           <groupId>net.sf.cssbox</groupId>
           <artifactId>pdf2dom</artifactId>
           <version>2.0.3</version>
       </dependency>

/**
 * PDF-to-DOM parser that, while building the DOM, collects the images whose origin lies
 * close to a piece of text containing a given search string.
 *
 * <p>Matching is done per page and is independent of rendering order: an image is collected
 * whether it is rendered before or after the matching text, and every matching text on a
 * page is taken into account (not only the last one).
 *
 * <p>Distances are computed between the text box position ({@code x}, {@code top}) and the
 * image position ({@code x}, {@code y}) exactly as they are reported to the render callbacks.
 */
public class MyPdf extends PDFDomTree {
    /** Search text used by the no-argument constructor. */
    private static final String DEFAULT_SEARCH_TEXT = "--";
    /** Distance threshold used by the no-argument constructor. */
    private static final float DEFAULT_MAX_DISTANCE = 100f;

    /** Positions of all matching texts seen so far, keyed by page counter. */
    private final Map<Integer, List<Point>> textPositions = new TreeMap<>();
    /** Images rendered so far that are not (yet) near a matching text, keyed by page counter. */
    private final Map<Integer, List<PendingImage>> pendingImages = new TreeMap<>();
    /** Images found within {@link #maxDistance} of a matching text. */
    private final List<ImageResource> nearestImages = new ArrayList<>();
    /** Text to look for. */
    private final String searchText;
    /** Exclusive upper bound for the text-to-image distance. */
    private final float maxDistance;

    /**
     * Creates a parser that searches for {@code "--"} and collects images closer than 100 units.
     */
    public MyPdf() throws IOException {
        this(DEFAULT_SEARCH_TEXT, DEFAULT_MAX_DISTANCE);
    }

    /**
     * Creates a parser with a custom search text and distance threshold.
     *
     * @param searchText  text to look for; must not be {@code null} or empty
     * @param maxDistance images strictly closer than this distance to a matching text are
     *                    collected; must be greater than zero
     * @throws IllegalArgumentException if an argument is invalid
     */
    public MyPdf(String searchText, float maxDistance) throws IOException {
        super();
        if (searchText == null || searchText.isEmpty()) {
            throw new IllegalArgumentException("searchText must not be null or empty");
        }
        // Written as a negated comparison so that NaN is rejected as well.
        if (!(maxDistance > 0)) {
            throw new IllegalArgumentException("maxDistance must be > 0 but was " + maxDistance);
        }
        this.searchText = searchText;
        this.maxDistance = maxDistance;
    }

    /** Logs the current page counter and delegates to the superclass. */
    @Override
    protected void startNewPage() {
        System.out.println("====页码:" + pagecnt);
        super.startNewPage();
    }

    /**
     * Appends the text to the current page element and, when the text contains the search
     * string, records its position and collects already-rendered images that are near it.
     */
    @Override
    protected void renderText(String data, TextMetrics metrics) {
        if (data.contains(searchText)) {
            System.out.println("====文本:" + data + "," +  ",x:" + (int)metrics.getX() + ",top:" + (int)metrics.getTop() + ",width:" + (int)metrics.getWidth() + ",height:" + (int)metrics.getHeight() );
            // Coordinates are truncated to whole units, as in the logged values above.
            Point hit = new Point((int) metrics.getX(), (int) metrics.getTop());
            List<Point> hitsOnPage = textPositions.get(pagecnt);
            if (hitsOnPage == null) {
                hitsOnPage = new ArrayList<>();
                textPositions.put(pagecnt, hitsOnPage);
            }
            hitsOnPage.add(hit);
            // Images drawn earlier on this page could not be matched when they were rendered.
            collectPendingImagesNear(pagecnt, hit);
        }
        curpage.appendChild(createTextElement(data, metrics.getWidth()));
    }

    // To inspect vector paths as well, renderPath(List<PathSegment>, boolean, boolean)
    // can be overridden in the same way as the callbacks above.

    /**
     * Appends the image to the current page element and collects it when a matching text on
     * the same page is close enough; otherwise the image is remembered so that a matching
     * text rendered later can still claim it.
     */
    @Override
    protected void renderImage(float x, float y, float width, float height, ImageResource resource) throws IOException {
        System.out.println("====图片:" + "x:" + x + ",y:" + y + ",width:" + width + ",height:" + height);
        curpage.appendChild(createImageElement(x, y, width, height, resource));
        if (isNearAnyText(pagecnt, x, y)) {
            nearestImages.add(resource);
        } else {
            List<PendingImage> waiting = pendingImages.get(pagecnt);
            if (waiting == null) {
                waiting = new ArrayList<>();
                pendingImages.put(pagecnt, waiting);
            }
            waiting.add(new PendingImage(x, y, resource));
        }
    }

    /**
     * Parses the document, populating the list of images near the search text.
     * Failures are reported on standard error and do not propagate to the caller.
     *
     * @param doc the loaded PDF document; the caller remains responsible for closing it
     */
    public void parsePdf(PDDocument doc) {
        try {
            createDOM(doc);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the images collected so far.
     *
     * @return a copy of the collected images, in the order they were matched
     */
    public List<ImageResource> getNearestImages() {
        return new ArrayList<>(nearestImages);
    }

    /** Tells whether any matching text recorded for the page is within range of the point. */
    private boolean isNearAnyText(int page, float x, float y) {
        List<Point> hitsOnPage = textPositions.get(page);
        if (hitsOnPage == null) {
            return false;
        }
        for (Point hit : hitsOnPage) {
            if (isWithinRange(hit, x, y)) {
                return true;
            }
        }
        return false;
    }

    /** Moves the page's pending images that are within range of the text to the result list. */
    private void collectPendingImagesNear(int page, Point text) {
        List<PendingImage> waiting = pendingImages.get(page);
        if (waiting == null) {
            return;
        }
        // Index loop: the index only advances when nothing was removed at that position.
        for (int i = 0; i < waiting.size(); ) {
            PendingImage candidate = waiting.get(i);
            if (isWithinRange(text, candidate.x, candidate.y)) {
                nearestImages.add(candidate.resource);
                waiting.remove(i);
            } else {
                i++;
            }
        }
    }

    /** Tells whether the Euclidean distance between the text and the point is below the limit. */
    private boolean isWithinRange(Point text, float x, float y) {
        return Point2D.distance(text.getX(), text.getY(), x, y) < maxDistance;
    }

    /**
     * Loads {@code src/main/resources/2.pdf}, collects the images near the default search
     * text and writes each of them as a PNG file next to the PDF.
     */
    public static void main(String[] args) {
        File pdfFile = new File("src/main/resources/2.pdf");
        // try-with-resources guarantees the document is closed even when parsing fails.
        try (PDDocument document = PDDocument.load(pdfFile)) {
            MyPdf pdfDomTree = new MyPdf();
            pdfDomTree.parsePdf(document);
            List<ImageResource> images = pdfDomTree.nearestImages;
            for (int i = 0; i < images.size(); i++) {
                byte[] imageData = images.get(i).getData();
                File outputFile = new File("src/main/resources/image" + (i + 1) + ".png"); // output path of the image
                // ImageIO.read returns null when no registered reader understands the data.
                BufferedImage decoded = ImageIO.read(new ByteArrayInputStream(imageData));
                if (decoded == null) {
                    System.err.println("Nearest image " + (i + 1) + " could not be decoded and was skipped");
                    continue;
                }
                if (!ImageIO.write(decoded, "PNG", outputFile)) {
                    System.err.println("Nearest image " + (i + 1) + " could not be written as PNG");
                    continue;
                }
                System.out.println("Nearest image " + (i + 1) + " saved at: " + outputFile.getAbsolutePath());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /** An image that has been rendered but is not yet known to be near a matching text. */
    private static final class PendingImage {
        private final float x;
        private final float y;
        private final ImageResource resource;

        PendingImage(float x, float y, ImageResource resource) {
            this.x = x;
            this.y = y;
            this.resource = resource;
        }
    }
}
 /**
  * Immutable pair of single-precision coordinates.
  */
 class Point {
    /** Horizontal coordinate. */
    private final float x;
    /** Vertical coordinate. */
    private final float y;

    /**
     * Creates a point from its two coordinates.
     *
     * @param x horizontal coordinate
     * @param y vertical coordinate
     */
    public Point(float x, float y) {
        this.y = y;
        this.x = x;
    }

    /** @return the horizontal coordinate given at construction */
    public float getX() {
        return this.x;
    }

    /** @return the vertical coordinate given at construction */
    public float getY() {
        return this.y;
    }
}