<!-- Maven dependency: Apache PDFBox (org.apache.pdfbox:pdfbox) -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.23</version>
</dependency>
获取传入的 PDF 中的图片
这里按页逐页读取,更灵活。
/**
 * Extracts the image XObjects listed in each page's resources and saves them as PNG files.
 *
 * <p>Pages are visited one at a time. Each image is written to the {@code D:/image} directory
 * as {@code image_<page>页<n>.png}, where {@code <page>} is the 1-based page number and
 * {@code <n>} is a counter shared by the whole document (it is not reset on each page).
 * Only XObjects named directly in the page resources are examined.
 *
 * @param args unused
 * @throws IOException if the PDF cannot be loaded, the output directory cannot be created,
 *     an image cannot be decoded, or an output file cannot be written
 */
public static void main(String[] args) throws IOException {
    int count = 0;
    File file = new File("D:\\Data\\电子图书馆_使用文档.pdf");

    // ImageIO.write fails when the parent directory is missing, so create it up front.
    File outputDir = new File("D:\\image");
    if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
        throw new IOException("Cannot create output directory: " + outputDir);
    }

    // try-with-resources closes the document (and the file it holds) even if a page fails.
    try (PDDocument document = PDDocument.load(file)) {
        int allPages = document.getNumberOfPages();
        for (int i = 0; i < allPages; i++) {
            PDPage page = document.getPage(i);
            PDResources resources = page.getResources();
            if (resources == null) {
                // A page without a resource dictionary has nothing to extract.
                continue;
            }
            Iterable<COSName> xObjectNames = resources.getXObjectNames();
            if (xObjectNames == null) {
                continue;
            }
            for (COSName key : xObjectNames) {
                if (!resources.isImageXObject(key)) {
                    continue;
                }
                PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                BufferedImage bImage = image.getImage();
                File target = new File(outputDir, "image_" + (i + 1) + "页" + count + ".png");
                ImageIO.write(bImage, "PNG", target);
                count++;
            }
        }
    }
}
获取传入的 PDF 中的文字内容
同样按页逐页读取。
/**
 * Prints the text content of a PDF, one page at a time.
 *
 * <p>The document is loaded once and the text stripper is restricted to a single page per
 * iteration, so no intermediate per-page documents have to be created or closed. Each page is
 * printed to standard output prefixed with its 1-based page number.
 *
 * @param args unused
 * @throws IOException if the PDF cannot be loaded or its text cannot be extracted
 */
public static void main(String[] args) throws IOException {
    File file = new File("D:\\Data\\Java课件\\xxx.pdf");

    // try-with-resources closes the loaded document even when extraction throws.
    try (PDDocument doc = PDDocument.load(file)) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        int allPages = doc.getNumberOfPages();
        for (int i = 0; i < allPages; i++) {
            // Stripper page numbers are 1-based and inclusive; start == end selects one page.
            int pageNumber = i + 1;
            pdfStripper.setStartPage(pageNumber);
            pdfStripper.setEndPage(pageNumber);
            String text = pdfStripper.getText(doc);
            System.out.println("第"+(i+1)+"页内容:"+text);
        }
    }
}