/**
 * Extracts the images found on a range of PDF pages and returns each one as a Base64 data URI.
 *
 * <p>Pages are addressed with 1-based numbers and both bounds are inclusive, i.e. pages
 * {@code startIndex..endIndex} are scanned. The page numbers are not validated here; a page
 * number outside the document is passed straight to {@code PDDocument.getPage}.
 *
 * <p>Note from the original author: images come out in the order in which they were inserted
 * when the PDF was created, not necessarily in visual order.
 *
 * <p>Every image is re-encoded as PNG by {@code Base64Util.convertimgtoBase64}, so the data
 * URI is labelled {@code image/png}.
 *
 * <p>Extraction is best-effort: if an {@link IOException} occurs while loading the document or
 * decoding an image, the stack trace is printed and the images collected so far are returned.
 *
 * @param file       the PDF file to read
 * @param startIndex 1-based number of the first page to scan (inclusive)
 * @param endIndex   1-based number of the last page to scan (inclusive)
 * @return data URIs ({@code data:image/png;base64,...}) of the extracted images; never null,
 *         possibly empty
 */
public static List<String> getbase64Photos(File file, int startIndex, int endIndex) {
    List<String> photos = new ArrayList<>();
    try (PDDocument document = PDDocument.load(file)) {
        // Page indices in PDFBox are 0-based, hence startIndex - 1.
        for (int i = startIndex - 1; i < endIndex; i++) {
            PDPage pdfpage = document.getPage(i);
            PDResources pdResources = pdfpage.getResources();
            if (pdResources == null) {
                // A page without a resource dictionary cannot hold any images.
                continue;
            }
            for (COSName xObjectName : pdResources.getXObjectNames()) {
                PDXObject o = pdResources.getXObject(xObjectName);
                if (o instanceof PDImageXObject) {
                    // Decode the embedded image; to inspect the extraction order while
                    // debugging, it can be written to disk with ImageIO.write at this point.
                    BufferedImage image = ((PDImageXObject) o).getImage();
                    String base64img = Base64Util.convertimgtoBase64(image);
                    // The payload is PNG-encoded, so the MIME type must be image/png.
                    photos.add("data:image/png;base64," + base64img);
                }
            }
        }
    } catch (IOException e) {
        // Best-effort: report the failure and return whatever was extracted so far.
        e.printStackTrace();
    }
    return photos;
}
// Note: PDFBox extracts images in the order in which they were inserted into the PDF.
/**
 * Helper for turning in-memory images into Base64 text.
 */
public class Base64Util {
    /**
     * Encodes an image as PNG and returns the PNG bytes as a single-line Base64 string.
     *
     * <p>The result uses the standard Base64 alphabet with padding and contains no line
     * separators. It is the raw Base64 payload only; no {@code data:} URI prefix is added.
     *
     * @param image the image to encode
     * @return the Base64-encoded PNG data, or an empty string if writing the image failed
     *         with an {@link IOException}
     */
    public static String convertimgtoBase64(BufferedImage image) {
        String pngBase64 = "";
        try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
            // Serialize the image as PNG into the in-memory buffer.
            ImageIO.write(image, "png", baos);
            byte[] bytes = baos.toByteArray();
            // The JDK's basic encoder never inserts CR/LF, so no post-processing is needed.
            // The fully qualified name avoids clashing with any other Base64 class the
            // file may import.
            pngBase64 = java.util.Base64.getEncoder().encodeToString(bytes);
        } catch (IOException e) {
            // Best-effort: report the failure and fall through to return the empty string.
            e.printStackTrace();
        }
        return pngBase64;
    }
}