pdfbox: Apache PDFBox是一个开源Java库,支持PDF文档的开发和转换。Apache PDFBox | A Java PDF Library: The Apache PDFBox™ library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0. https://pdfbox.apache.org/
pom
pdfbox
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.1</version>
</dependency>
ImgUtil.cut
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.0.M1</version>
</dependency>
tess4j
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<version>4.5.2</version>
</dependency>
String imagePath = "D:/1/t.jpg"; //加载pdf文档 PDDocument document = PDDocument.load(new File("d:/1/d.pdf")); //创建PDFRenderer PDFRenderer pdfRenderer = new PDFRenderer(document); //读取第一张图片 BufferedImage bufferedImage = pdfRenderer.renderImage(0); //输出 ImageIO.write(bufferedImage, "JPEG", new File(imagePath)); String imagePath1 = "D:/1/t1.jpg"; //裁剪图片 ImgUtil.cut(FileUtil.file(imagePath), FileUtil.file(imagePath1), new Rectangle(790, 140, 274, 94)); document.close(); // ITesseract instance = new Tesseract(); // 语言库位置(修改为跟自己语言库文件夹的路径) String lagnguagePath = "D:\\tessdata"; instance.setDatapath(lagnguagePath); chi_sim :简体中文, eng 根据需求选择语言库 instance.setLanguage("chi_sim"); instance.setTessVariable("user_defined_dpi", "300"); String result = instance.doOCR(FileUtil.file(imagePath1)); System.out.println(result);