Apache PDFBox是一个开源的、基于Java的、支持PDF文档生成的工具库,它可以用于创建新的PDF文档,修改现有的PDF文档,还可以从PDF文档中提取所需的内容。Apache PDFBox还包含了数个命令行工具。
Apache PDFBox于2016年4月26日发布了最新的2.0.1版。
备注:本文代码均是基于2.0及以上版本编写。
官网地址:https://pdfbox.apache.org/index.html
PDFBox 2.0.1 API在线文档:https://pdfbox.apache.org/docs/2.0.1/javadocs/
1. JAR包
pdfbox-2.0.1.jar下载地址
fontbox-2.0.1.jar下载地址
访问网址 http://sourceforge.net/projects/pdfbox/ 。
package com.airport.demo.tcpReplace; import java.awt.image.BufferedImage; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.logging.Logger; import javax.imageio.ImageIO; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; import org.apache.pdfbox.pdmodel.PDResources; import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode; import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; import org.apache.pdfbox.text.PDFTextStripper; import com.airport.demo.bean.MRZData; import com.jogamp.opengl.util.packrect.Level; public class PDFUtil { public static MRZData readPDF() { MRZData mData = new MRZData(); File pdfFile = new File("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\1.pdf"); PDDocument document = null; try { // 方式一: // InputStream input = null; // input = new FileInputStream(pdfFile); // // 加载 pdf 文档 // PDFParser parser = new PDFParser(input, new // RandomAccessBuffer()); // parser.parse(); // document = parser.getPDDocument(); // 方式二: document = PDDocument.load(pdfFile); // 获取页码 int pages = document.getNumberOfPages(); // 读文本内容 PDFTextStripper stripper = new PDFTextStripper(); // 设置按顺序输出 stripper.setSortByPosition(true); stripper.setStartPage(1); stripper.setEndPage(1); String content = stripper.getText(document); System.out.println(content); document.close(); } catch (Exception e) { System.out.println(e); } return mData; } public static void main(String[] args) { System.out.println(readPDF()); String f="C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\1.pdf"; String 
path="C:\\Users\\weidx\\Desktop\\"; pdfSaveImage(f,path); } public static void pdfSaveImage(String file, String imgSavePath) { try { // 打开pdf文件流 FileInputStream fis = new FileInputStream(file); // 加载 pdf 文档,获取PDDocument文档对象 PDDocument document = PDDocument.load(fis); /** 文档页面信息 **/// 获取PDDocumentCatalog文档目录对象 PDDocumentCatalog catalog = document.getDocumentCatalog(); // 获取文档页面PDPage列表 int pages = document.getNumberOfPages(); int count = 1; for (int j = 1; j < pages; j++) { PDPage page = document.getPage(j); PDResources resources = page.getResources(); Iterable xobjects = resources.getXObjectNames(); if (xobjects != null) { Iterator imageIter = xobjects.iterator(); while (imageIter.hasNext()) { COSName key = (COSName) imageIter.next(); if (resources.isImageXObject(key)) { try { PDImageXObject image = (PDImageXObject) resources.getXObject(key); BufferedImage bimage = image.getImage(); ImageIO.write(bimage, "jpg", new File(imgSavePath + count + ".jpg")); count++; System.out.println(count); } catch (Exception e) { } } } } } // document.close(); // fis.close(); } catch (Exception e) { System.out.println(); } } public static void readImage() { // 待解析PDF File pdfFile = new File("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\in.pdf"); // 空白PDF File pdfFile_out = new File("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\out.pdf"); PDDocument document = null; PDDocument document_out = null; try { document = PDDocument.load(pdfFile); document_out = PDDocument.load(pdfFile_out); } catch (IOException e) { e.printStackTrace(); } int pages_size = document.getNumberOfPages(); System.out.println("getAllPages===============" + pages_size); int j = 0; for (int i = 0; i < pages_size; i++) { PDPage page = document.getPage(i); PDPage page1 = document_out.getPage(0); PDResources resources = page.getResources(); Iterable xobjects = resources.getXObjectNames(); if (xobjects != null) { Iterator imageIter = xobjects.iterator(); while (imageIter.hasNext()) { COSName key = 
(COSName) imageIter.next(); if (resources.isImageXObject(key)) { try { PDImageXObject image = (PDImageXObject) resources.getXObject(key); // 方式一:将PDF文档中的图片 分别存到一个空白PDF中。 PDPageContentStream contentStream = new PDPageContentStream(document_out, page1, AppendMode.APPEND, true); float scale = 1f; contentStream.drawImage(image, 20, 20, image.getWidth() * scale, image.getHeight() * scale); contentStream.close(); document_out.save("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\" + j + ".pdf"); System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth()); /** * // 方式二:将PDF文档中的图片 分别另存为图片。 File file = new * File("/Users/xiaolong/Downloads/123"+j+".png"); * FileOutputStream out = new * FileOutputStream(file); * * InputStream input = image.createInputStream(); * * int byteCount = 0; byte[] bytes = new byte[1024]; * * while ((byteCount = input.read(bytes)) > 0) { * out.write(bytes,0,byteCount); } * * out.close(); input.close(); **/ } catch (IOException e) { e.printStackTrace(); } j++; } } } } System.out.println(j); } }
提供另外一种思路(注意:以下代码使用的是 PDFBox 1.x 的旧版 API,例如 getAllPages()、findResources()、PDXObjectImage,这些在 2.0 中已被移除或替换):
- /** Date/time pattern handed to SimpleDateFormat by dateFormat(Calendar). */
- public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
- /**
- * Parses a PDF document: prints its metadata to standard output and saves
- * every image found in the page resources.
- * Each image is written via write2file( imgSavePath + n ), with n counting up from 1.
- * @param pdfPath path of the PDF document
- * @param imgSavePath path prefix for the saved images
- * @throws Exception if the document cannot be opened, read or closed, or an image cannot be written
- */
- public static void pdfParse( String pdfPath, String imgSavePath ) throws Exception
- {
- InputStream input = null;
- PDDocument document = null;
- try
- {
- input = new FileInputStream( new File( pdfPath ) );
- // Load the PDF document.
- document = PDDocument.load( input );
- /** Document properties **/
- PDDocumentInformation info = document.getDocumentInformation();
- System.out.println( "标题:" + info.getTitle() );
- System.out.println( "主题:" + info.getSubject() );
- System.out.println( "作者:" + info.getAuthor() );
- System.out.println( "关键字:" + info.getKeywords() );
- System.out.println( "应用程序:" + info.getCreator() );
- System.out.println( "pdf 制作程序:" + info.getProducer() );
- // This value is the "Trapped" entry, not the author, so it gets its own label.
- System.out.println( "Trapped:" + info.getTrapped() );
- System.out.println( "创建时间:" + dateFormat( info.getCreationDate() ));
- System.out.println( "修改时间:" + dateFormat( info.getModificationDate()));
- /** Page information **/
- PDDocumentCatalog cata = document.getDocumentCatalog();
- List pages = cata.getAllPages();
- int count = 1;
- for( int i = 0; i < pages.size(); i++ )
- {
- PDPage page = ( PDPage ) pages.get( i );
- if( null == page )
- {
- continue;
- }
- PDResources res = page.findResources();
- if( null == res )
- {
- // A page without resources has no images to save.
- continue;
- }
- // Images referenced by this page.
- Map imgs = res.getImages();
- if( null == imgs )
- {
- continue;
- }
- Iterator it = imgs.values().iterator();
- while( it.hasNext() )
- {
- PDXObjectImage img = ( PDXObjectImage ) it.next();
- img.write2file( imgSavePath + count );
- count++;
- }
- }
- }
- finally
- {
- // Close the document first; the nested finally guarantees the stream is
- // closed even when closing the document fails.
- try
- {
- if( null != document )
- {
- document.close();
- }
- }
- finally
- {
- if( null != input )
- {
- input.close();
- }
- }
- }
- }
- /**
- * Formats a calendar value with the DATE_FORMAT pattern.
- * @param calendar the point in time to format; may be null
- * @return the formatted time, or null when calendar is null
- * @throws Exception any exception raised while formatting is passed on unchanged
- */
- public static String dateFormat( Calendar calendar ) throws Exception
- {
- // Guard clause: a missing calendar yields null, not an empty string.
- if( calendar == null )
- {
- return null;
- }
- final SimpleDateFormat formatter = new SimpleDateFormat( DATE_FORMAT );
- final String formatted = formatter.format( calendar.getTime() );
- if( formatted == null )
- {
- return "";
- }
- return formatted;
- }
以下是其他的操作,仅供参考:

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.util.PDFTextStripper;

import static readPDFContent.PDFParse.dateFormat;

/**
 * Helper methods for reading a PDF with the PDFBox 1.x API: outline,
 * metadata, text and image extraction.
 *
 * <p>Every method opens the file itself, closes all resources it opened, and
 * logs I/O failures instead of propagating them.
 *
 * @author Angela
 */
public class PDFReader {

    private static final Logger LOG = Logger.getLogger(PDFReader.class.getName());

    /**
     * Formats a calendar value as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param calendar the point in time to format; may be {@code null}
     * @return the formatted time, or {@code null} when {@code calendar} is {@code null}
     */
    public static String dateFormat(Calendar calendar) {
        if (null == calendar) {
            return null;
        }
        String pattern = "yyyy-MM-dd HH:mm:ss";
        SimpleDateFormat format = new SimpleDateFormat(pattern);
        String date = format.format(calendar.getTime());
        return date == null ? "" : date;
    }

    /**
     * Prints the document outline: every top-level entry and its direct children.
     *
     * @param file path of the PDF document
     */
    public static void getPDFOutline(String file) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = PDDocument.load(fis)) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            // The outline must be checked for null before it is dereferenced.
            if (outline == null) {
                return;
            }
            // Walk every level-1 heading.
            for (PDOutlineItem item = outline.getFirstChild(); item != null; item = item.getNextSibling()) {
                System.out.println("Item:" + item.getTitle());
                // Walk the level-2 headings below it.
                for (PDOutlineItem child = item.getFirstChild(); child != null; child = child.getNextSibling()) {
                    System.out.println(" Child:" + child.getTitle());
                }
            }
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to read the outline of " + file, ex);
        }
    }

    /**
     * Prints only the top-level entries of the document outline.
     *
     * @param file path of the PDF document
     */
    public static void getPDFCatalog(String file) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = PDDocument.load(fis)) {
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            PDDocumentOutline outline = catalog.getDocumentOutline();
            if (outline == null) {
                return;
            }
            for (PDOutlineItem item = outline.getFirstChild(); item != null; item = item.getNextSibling()) {
                System.out.println("Item:" + item.getTitle());
            }
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to read the outline of " + file, ex);
        }
    }

    /**
     * Prints the page count and the document information entries.
     *
     * @param file path of the PDF document
     */
    public static void getPDFInformation(String file) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = PDDocument.load(fis)) {
            /** Document properties **/
            PDDocumentInformation info = document.getDocumentInformation();
            System.out.println("页数:" + document.getNumberOfPages());
            System.out.println("标题:" + info.getTitle());
            System.out.println("主题:" + info.getSubject());
            System.out.println("作者:" + info.getAuthor());
            System.out.println("关键字:" + info.getKeywords());
            System.out.println("应用程序:" + info.getCreator());
            System.out.println("pdf 制作程序:" + info.getProducer());
            System.out.println("Trapped:" + info.getTrapped());
            System.out.println("创建时间:" + dateFormat(info.getCreationDate()));
            System.out.println("修改时间:" + dateFormat(info.getModificationDate()));
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to read the metadata of " + file, ex);
        }
    }

    /**
     * Prints the text of the whole document.
     *
     * @param file path of the PDF document
     */
    public static void extractTXT(String file) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = parseDocument(fis)) {
            PDFTextStripper stripper = new PDFTextStripper();
            String content = stripper.getText(document);
            System.out.println("内容:" + content);
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to extract text from " + file, ex);
        }
    }

    /**
     * Prints the text of a range of pages.
     *
     * @param file      path of the PDF document
     * @param startPage first page to extract
     * @param endPage   last page to extract
     */
    public static void extractTXT(String file, int startPage, int endPage) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = parseDocument(fis)) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            String content = stripper.getText(document);
            System.out.println("内容:" + content);
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to extract text from " + file, ex);
        }
    }

    /**
     * Saves every image found in the page resources.
     *
     * <p>Each image is written with {@code write2file(imgSavePath + n)}, with
     * {@code n} counting up from 1; the image type suffix is appended by
     * {@code write2file}.
     *
     * @param file        path of the PDF document
     * @param imgSavePath path prefix for the saved images
     */
    public static void extractImage(String file, String imgSavePath) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = PDDocument.load(fis)) {
            /** Page information **/
            PDDocumentCatalog catalog = document.getDocumentCatalog();
            List pages = catalog.getAllPages();
            int count = 1;
            int pageNum = pages.size();
            for (int i = 0; i < pageNum; i++) {
                PDPage page = (PDPage) pages.get(i);
                if (null == page) {
                    continue;
                }
                PDResources resource = page.findResources();
                if (null == resource) {
                    // A page without resources has no images to save.
                    continue;
                }
                Map<String, PDXObjectImage> imgs = resource.getImages();
                if (null == imgs) {
                    continue;
                }
                for (Map.Entry<String, PDXObjectImage> me : imgs.entrySet()) {
                    PDXObjectImage img = me.getValue();
                    img.write2file(imgSavePath + count);
                    count++;
                }
            }
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to extract images from " + file, ex);
        }
    }

    /**
     * Writes the text of the whole document to a file.
     *
     * @param file     path of the PDF document
     * @param savePath path of the text file to write
     */
    public static void extractTXT(String file, String savePath) {
        // The writer uses the platform default charset, as OutputStreamWriter does
        // when no charset is given.
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = parseDocument(fis);
                Writer writer = new OutputStreamWriter(new FileOutputStream(savePath))) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(document, writer);
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to save the text of " + file + " to " + savePath, ex);
        }
    }

    /**
     * Writes the text of a range of pages to a file.
     *
     * @param file      path of the PDF document
     * @param startPage first page to extract
     * @param endPage   last page to extract
     * @param savePath  path of the text file to write
     */
    public static void extractTXT(String file, int startPage, int endPage, String savePath) {
        try (FileInputStream fis = new FileInputStream(file);
                PDDocument document = parseDocument(fis);
                Writer writer = new OutputStreamWriter(new FileOutputStream(savePath))) {
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            stripper.writeText(document, writer);
        } catch (IOException ex) {
            LOG.log(Level.SEVERE, "Failed to save the text of " + file + " to " + savePath, ex);
        }
    }

    /** Parses the stream with a PDFParser and returns the resulting document. */
    private static PDDocument parseDocument(FileInputStream fis) throws IOException {
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        return parser.getPDDocument();
    }

    /**
     * Demo entry point: extracts the text of a sample report and prints the elapsed time.
     *
     * @param args ignored
     */
    public static void main(String args[]) {
        String file = "F:\\pdf\\2013\\000608_阳光股份_2013年年度报告(更新后)_1.pdf";
        String savePath = "E:\\result1.txt";
        long startTime = System.currentTimeMillis();
        extractTXT(file, savePath);
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为:" + (endTime - startTime) + "ms");
    }
}