PDF读取框架pdfbox 图片读取和存储以及创建新pdf

最新推荐文章于 2022-04-25 11:02:54 发布
暮鼓晨钟nbv
最新推荐文章于 2022-04-25 11:02:54 发布
阅读量4.1k
点赞数 3
分类专栏：文档操作文章标签：框架开源 pdf
本文链接：https://blog.csdn.net/qq_29048719/article/details/76474703
版权
文档操作专栏收录该内容
1 篇文章 0 订阅
订阅专栏
Apache PDFBox是一个开源的、基于Java的、支持PDF文档生成的工具库，它可以用于创建新的PDF文档，修改现有的PDF文档，还可以从PDF文档中提取所需的内容。Apache PDFBox还包含了数个命令行工具。
Apache PDFBox于2016年4月26日发布了最新的2.0.1版。
备注：本文代码均是基于2.0及以上版本编写。
官网地址：https://pdfbox.apache.org/index.html
PDFBox 2.0.1 API在线文档：https://pdfbox.apache.org/docs/2.0.1/javadocs/
1. JAR包
pdfbox-2.0.1.jar下载地址
fontbox-2.0.1.jar下载地址
访问网址 http://sourceforge.net/projects/pdfbox/ 。
package com.airport.demo.tcpReplace;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import javax.imageio.ImageIO;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.PDPageContentStream.AppendMode;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import com.airport.demo.bean.MRZData;
import com.jogamp.opengl.util.packrect.Level;

/**
 * Small demo utilities for the PDFBox 2.x API: printing the text of a page,
 * saving the images embedded in a PDF as JPEG files, and copying embedded
 * images into another PDF.
 */
public class PDFUtil {

	/**
	 * Prints the text of the first page of a hard-coded sample PDF to standard
	 * output. Any failure is printed to standard output and otherwise ignored.
	 *
	 * @return a freshly created {@link MRZData}; this method never populates it
	 */
	public static MRZData readPDF() {
		MRZData mData = new MRZData();
		File pdfFile = new File("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\1.pdf");
		// An alternative way to load is PDFParser + RandomAccessBuffer followed by
		// parser.getPDDocument(); PDDocument.load is the simpler form.
		// try-with-resources closes the document even when text extraction fails.
		try (PDDocument document = PDDocument.load(pdfFile)) {
			PDFTextStripper stripper = new PDFTextStripper();
			// Emit text in reading order (sorted by position on the page).
			stripper.setSortByPosition(true);
			// Stripper page numbers are 1-based: restrict output to page 1.
			stripper.setStartPage(1);
			stripper.setEndPage(1);
			String content = stripper.getText(document);
			System.out.println(content);
		} catch (Exception e) {
			System.out.println(e);
		}
		return mData;
	}

	/**
	 * Demo entry point: prints the first page's text of the sample PDF, then
	 * saves its embedded images to the desktop folder.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {

		System.out.println(readPDF());
		String f="C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\1.pdf";
		String path="C:\\Users\\weidx\\Desktop\\";
		pdfSaveImage(f,path);
	}

	/**
	 * Saves every image XObject found in the page resources of a PDF as
	 * {@code <imgSavePath><n>.jpg}, where {@code n} starts at 1 and increases
	 * for each image across all pages.
	 *
	 * <p>Failures are reported on standard error; a failure on one image does
	 * not stop processing of the remaining images.
	 *
	 * @param file        path of the PDF file to read
	 * @param imgSavePath path prefix (typically a directory ending in a
	 *                    separator) that the image number and ".jpg" are appended to
	 */
	public static void pdfSaveImage(String file, String imgSavePath) {
		// Both resources are closed automatically, including on failure.
		try (FileInputStream fis = new FileInputStream(file);
				PDDocument document = PDDocument.load(fis)) {
			int pages = document.getNumberOfPages();
			int count = 1;
			// PDDocument.getPage takes a 0-based index, so start at 0 to include
			// the first page.
			for (int j = 0; j < pages; j++) {
				PDResources resources = document.getPage(j).getResources();
				if (resources == null) {
					continue;
				}
				Iterable<COSName> xobjects = resources.getXObjectNames();
				if (xobjects == null) {
					continue;
				}
				for (COSName key : xobjects) {
					if (!resources.isImageXObject(key)) {
						continue;
					}
					try {
						PDImageXObject image = (PDImageXObject) resources.getXObject(key);
						BufferedImage bimage = image.getImage();
						File target = new File(imgSavePath + count + ".jpg");
						// ImageIO.write returns false (without throwing) when no
						// writer accepts the image, e.g. a JPEG writer given an
						// image with an alpha channel.
						if (!ImageIO.write(bimage, "jpg", target)) {
							System.err.println("No JPEG writer accepted image " + key.getName()
									+ " on page " + (j + 1) + "; " + target + " was not written");
						}
						count++;
						// Prints the number the next image will receive.
						System.out.println(count);
					} catch (Exception e) {
						System.err.println("Failed to save image " + key.getName()
								+ " on page " + (j + 1) + ": " + e);
					}
				}
			}
		} catch (Exception e) {
			System.err.println("Failed to extract images from " + file + ": " + e);
		}
	}

	/**
	 * Copies every image XObject of the hard-coded input PDF onto the first
	 * page of the hard-coded output PDF and, after each image, saves the output
	 * document as {@code <j>.pdf} (j = 0, 1, 2, ...) in the PDFs folder.
	 *
	 * <p>All images are appended to the same page of the same in-memory
	 * document at position (20, 20), so each saved file also contains the
	 * images drawn before it.
	 *
	 * <p>An alternative is to write each image to its own file, either via
	 * {@code ImageIO} as in {@link #pdfSaveImage(String, String)} or by copying
	 * the bytes of {@code image.createInputStream()} to a file.
	 */
	public static void readImage() {

		// PDF to read images from
		File pdfFile = new File("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\in.pdf");
		// Blank PDF that receives the images
		File pdfFile_out = new File("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\out.pdf");

		// If either document fails to load we report it and stop, instead of
		// dereferencing a null document further down.
		try (PDDocument document = PDDocument.load(pdfFile);
				PDDocument document_out = PDDocument.load(pdfFile_out)) {

			int pages_size = document.getNumberOfPages();
			System.out.println("getAllPages===============" + pages_size);
			int j = 0;

			for (int i = 0; i < pages_size; i++) {
				PDPage page = document.getPage(i);
				PDPage page1 = document_out.getPage(0);
				PDResources resources = page.getResources();
				if (resources == null) {
					continue;
				}
				Iterable<COSName> xobjects = resources.getXObjectNames();
				if (xobjects == null) {
					continue;
				}
				for (COSName key : xobjects) {
					if (!resources.isImageXObject(key)) {
						continue;
					}
					try {
						PDImageXObject image = (PDImageXObject) resources.getXObject(key);

						float scale = 1f;
						// The content stream must be closed before saving; the
						// try-with-resources also closes it if drawImage fails.
						try (PDPageContentStream contentStream = new PDPageContentStream(document_out, page1,
								AppendMode.APPEND, true)) {
							contentStream.drawImage(image, 20, 20, image.getWidth() * scale,
									image.getHeight() * scale);
						}
						document_out.save("C:\\Users\\weidx\\Documents\\My Access-IS Data\\PDFs\\" + j + ".pdf");

						System.out.println(image.getSuffix() + "," + image.getHeight() + "," + image.getWidth());
					} catch (IOException e) {
						e.printStackTrace();
					}
					// Counted even when the copy failed, so file numbers follow
					// the image's position in the input.
					j++;
				}
			}
			System.out.println(j);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}








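提供另外一种思路（注意：以下代码使用的是 PDFBox 1.8.x 的旧版 API，例如 getAllPages()、findResources()、PDXObjectImage，这些在 2.0 中已被移除，分别对应 getPages()、getResources()、PDImageXObject）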


 
 
/** SimpleDateFormat pattern used by dateFormat(Calendar) to render timestamps. */
public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";     

     

/**   

 * 解析pdf文档信息   

 * @param pdfPath   pdf文档路径   

 * @throws Exception   

 */    

public static void pdfParse( String pdfPath, String imgSavePath ) throws Exception     

{     

    InputStream input = null;     

    File pdfFile = new File( pdfPath );     

    PDDocument document = null;     

    try{     

        input = new FileInputStream( pdfFile );     

        //加载 pdf 文档     

        document = PDDocument.load( input );     

             

        /** 文档属性信息 **/    

        PDDocumentInformation info = document.getDocumentInformation();     

        System.out.println( "标题:" + info.getTitle() );     

        System.out.println( "主题:" + info.getSubject() );     

        System.out.println( "作者:" + info.getAuthor() );     

        System.out.println( "关键字:" + info.getKeywords() );     

             

        System.out.println( "应用程序:" + info.getCreator() );     

        System.out.println( "pdf 制作程序:" + info.getProducer() );     

             

        System.out.println( "作者:" + info.getTrapped() );     

             

        System.out.println( "创建时间:" + dateFormat( info.getCreationDate() ));     

        System.out.println( "修改时间:" + dateFormat( info.getModificationDate()));     

    

        /** 文档页面信息 **/    

        PDDocumentCatalog cata = document.getDocumentCatalog();     

        List pages = cata.getAllPages();     

        int count = 1;     

        for( int i = 0; i < pages.size(); i++ )     

        {     

            PDPage page = ( PDPage ) pages.get( i );     

            if( null != page )     

            {     

                PDResources res = page.findResources();     

                     

                //获取页面图片信息     

                Map imgs = res.getImages();     

                if( null != imgs )     

                {     

                    Set keySet = imgs.keySet();     

                    Iterator it = keySet.iterator();     

                    while( it.hasNext() )     

                    {     

                        Object obj =  it.next();     

                        PDXObjectImage img = ( PDXObjectImage ) imgs.get( obj );     

                        img.write2file( imgSavePath + count );     

                        count++;     

                    }     

                }     

            }     

        }     

    }catch( Exception e)     

    {     

        throw e;     

    }finally{     

        if( null != input )     

            input.close();     

        if( null != document )     

            document.close();     

    }     

}     

     

/**   

 * 获取格式化后的时间信息   

 * @param dar   时间信息   

 * @return   

 * @throws Exception   

 */    

public static String dateFormat( Calendar calendar ) throws Exception     

{     

    if( null == calendar )     

        return null;     

    String date = null;     

    try{     

        String pattern = DATE_FORMAT;     

        SimpleDateFormat format = new SimpleDateFormat( pattern );     

        date = format.format( calendar.getTime() );     

    }catch( Exception e )     

    {     

        throw e;     

    }     

    return date == null ? "" : date;     

}     

 
 
  
  
 
 

 
 
  
  以下是其他的操作，仅供参考
 
 

 
 
  
  [java] view plain copy print?
import java.io.FileInputStream;  
import java.io.FileNotFoundException;  
import java.io.FileOutputStream;  
import java.io.IOException;  
import java.io.OutputStreamWriter;  
import java.io.Writer;  
import java.text.SimpleDateFormat;  
import java.util.Calendar;  
import java.util.List;  
import java.util.Map;  
import java.util.logging.Level;  
import java.util.logging.Logger;  
import org.apache.pdfbox.pdfparser.PDFParser;  
import org.apache.pdfbox.pdmodel.PDDocument;  
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;  
import org.apache.pdfbox.pdmodel.PDDocumentInformation;  
import org.apache.pdfbox.pdmodel.PDPage;  
import org.apache.pdfbox.pdmodel.PDResources;  
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;  
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;  
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;  
import org.apache.pdfbox.util.PDFTextStripper;  
importstatic readPDFContent.PDFParse.dateFormat;  
  
/** 
 * 
 * @author Angela 
 */publicclassPDFReader {/**  
     * 获取格式化后的时间信息  
     * @param calendar   时间信息  
     * @return     */publicstatic String dateFormat( Calendar calendar ){    
        if( null == calendar )    
            returnnull;    
        String date = null;      
        String pattern = "yyyy-MM-dd HH:mm:ss";    
        SimpleDateFormat format = new SimpleDateFormat( pattern );    
        date = format.format( calendar.getTime() );    
        return date == null ? "" : date;    
    }    
  
        /**打印纲要**/publicstaticvoidgetPDFOutline(String file){  
        try {    
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //加载 pdf 文档,获取PDDocument文档对象  
            PDDocument document=PDDocument.load(fis);  
            //获取PDDocumentCatalog文档目录对象  
            PDDocumentCatalog catalog=document.getDocumentCatalog();  
            //获取PDDocumentOutline文档纲要对象  
            PDDocumentOutline outline=catalog.getDocumentOutline();  
            //获取第一个纲要条目（标题1）  
            PDOutlineItem item=outline.getFirstChild();  
            if(outline!=null){  
                //遍历每一个标题1while(item!=null){  
                    //打印标题1的文本  
                    System.out.println("Item:"+item.getTitle());  
                    //获取标题1下的第一个子标题（标题2）  
                    PDOutlineItem child=item.getFirstChild();   
                    //遍历每一个标题2while(child!=null){  
                        //打印标题2的文本  
                        System.out.println("    Child:"+child.getTitle());  
                        //指向下一个标题2  
                        child=child.getNextSibling();  
                    }  
                    //指向下一个标题1  
                    item=item.getNextSibling();  
                }  
            }  
            //关闭输入流  
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);  
        }   
    }  
  
    /**打印一级目录**/publicstaticvoidgetPDFCatalog(String file){  
        try {    
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //加载 pdf 文档,获取PDDocument文档对象  
            PDDocument document=PDDocument.load(fis);  
            //获取PDDocumentCatalog文档目录对象  
            PDDocumentCatalog catalog=document.getDocumentCatalog();  
            //获取PDDocumentOutline文档纲要对象  
            PDDocumentOutline outline=catalog.getDocumentOutline();  
            //获取第一个纲要条目（标题1）if(outline!=null){  
                PDOutlineItem item=outline.getFirstChild();  
                //遍历每一个标题1while(item!=null){  
                    //打印标题1的文本  
                    System.out.println("Item:"+item.getTitle());                 
                    //指向下一个标题1  
                    item=item.getNextSibling();  
                }  
            }  
            //关闭输入流  
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFBOXReader.class.getName()).log(Level.SEVERE, null, ex);  
        }   
    }  
  
    /**获取PDF文档元数据**/publicstaticvoidgetPDFInformation(String file){  
        try {    
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //加载 pdf 文档,获取PDDocument文档对象  
            PDDocument document=PDDocument.load(fis);  
            /** 文档属性信息 **/            PDDocumentInformation info = document.getDocumentInformation();   
  
            System.out.println("页数:"+document.getNumberOfPages());  
  
            System.out.println( "标题:" + info.getTitle() );    
            System.out.println( "主题:" + info.getSubject() );    
            System.out.println( "作者:" + info.getAuthor() );    
            System.out.println( "关键字:" + info.getKeywords() );               
  
            System.out.println( "应用程序:" + info.getCreator() );    
            System.out.println( "pdf 制作程序:" + info.getProducer() );    
  
            System.out.println( "Trapped:" + info.getTrapped() );    
  
            System.out.println( "创建时间:" + dateFormat( info.getCreationDate() ));    
            System.out.println( "修改时间:" + dateFormat( info.getModificationDate()));    
  
            //关闭输入流  
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        }   
    }  
  
    /**提取pdf文本**/publicstaticvoidextractTXT(String file){  
        try{  
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //实例化一个PDF解析器  
            PDFParser parser = new PDFParser(fis);  
            //解析pdf文档  
            parser.parse();  
            //获取PDDocument文档对象  
            PDDocument document=parser.getPDDocument();  
            //获取一个PDFTextStripper文本剥离对象             
            PDFTextStripper stripper = new PDFTextStripper();  
            //获取文本内容  
            String content = stripper.getText(document);   
            //打印内容  
            System.out.println( "内容:" + content );     
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        }  
    }  
  
    /** 
     * 提取部分页面文本 
     * @param file pdf文档路径 
     * @param startPage 开始页数 
     * @param endPage 结束页数 
     */publicstaticvoidextractTXT(String file,int startPage,int endPage){  
        try{  
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //实例化一个PDF解析器  
            PDFParser parser = new PDFParser(fis);  
            //解析pdf文档  
            parser.parse();  
            //获取PDDocument文档对象  
            PDDocument document=parser.getPDDocument();  
            //获取一个PDFTextStripper文本剥离对象             
            PDFTextStripper stripper = new PDFTextStripper();  
            // 设置起始页  
            stripper.setStartPage(startPage);  
            // 设置结束页  
            stripper.setEndPage(endPage);  
            //获取文本内容  
            String content = stripper.getText(document);   
            //打印内容  
            System.out.println( "内容:" + content );     
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        }  
    }  
  
    /** 
     * 提取图片并保存 
     * @param file PDF文档路径 
     * @param imgSavePath 图片保存路径 
     */publicstaticvoidextractImage(String file,String imgSavePath){  
        try{  
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //加载 pdf 文档,获取PDDocument文档对象  
            PDDocument document=PDDocument.load(fis);             
            /** 文档页面信息 **///获取PDDocumentCatalog文档目录对象  
            PDDocumentCatalog catalog = document.getDocumentCatalog();  
            //获取文档页面PDPage列表  
            List pages = catalog.getAllPages();    
            int count = 1;    
            int pageNum=pages.size();   //文档页数//遍历每一页for( int i = 0; i < pageNum; i++ ){    
                //取得第i页  
                PDPage page = ( PDPage ) pages.get( i );   
                if( null != page ){    
                    PDResources resource = page.findResources();                        
                    //获取页面图片信息   
                    Map<String,PDXObjectImage> imgs = resource.getImages();                      
                    for(Map.Entry<String,PDXObjectImage> me: imgs.entrySet()){  
                        //System.out.println(me.getKey());  
                        PDXObjectImage img = me.getValue();    
                        //保存图片，会自动添加图片后缀类型  
                        img.write2file( imgSavePath + count );    
                        count++;    
                    }    
                }    
            }    
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        }  
    }  
  
    /** 
     * 提取文本并保存 
     * @param file PDF文档路径 
     * @param savePath 文本保存路径 
     */publicstaticvoidextractTXT(String file,String savePath){  
        try{  
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //实例化一个PDF解析器  
            PDFParser parser = new PDFParser(fis);  
            //解析pdf文档  
            parser.parse();  
            //获取PDDocument文档对象  
            PDDocument document=parser.getPDDocument();  
            //获取一个PDFTextStripper文本剥离对象             
            PDFTextStripper stripper = new PDFTextStripper();  
            //创建一个输出流  
            Writer writer=new OutputStreamWriter(new FileOutputStream(savePath));  
            //保存文本内容  
            stripper.writeText(document, writer);               
            //关闭输出流  
            writer.close();  
            //关闭输入流  
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        }  
    }  
  
    /** 
     * 提取部分页面文本并保存 
     * @param file PDF文档路径 
     * @param startPage 开始页数 
     * @param endPage 结束页数 
     * @param savePath 文本保存路径 
     */publicstaticvoidextractTXT(String file,int startPage,  
            int endPage,String savePath){  
        try{  
            //打开pdf文件流  
            FileInputStream fis = new   FileInputStream(file);  
            //实例化一个PDF解析器  
            PDFParser parser = new PDFParser(fis);  
            //解析pdf文档  
            parser.parse();  
            //获取PDDocument文档对象  
            PDDocument document=parser.getPDDocument();  
            //获取一个PDFTextStripper文本剥离对象             
            PDFTextStripper stripper = new PDFTextStripper();  
            //创建一个输出流  
            Writer writer=new OutputStreamWriter(new FileOutputStream(savePath));  
            // 设置起始页  
            stripper.setStartPage(startPage);  
            // 设置结束页  
            stripper.setEndPage(endPage);  
            //保存文本内容  
            stripper.writeText(document, writer);               
            //关闭输出流  
            writer.close();  
            //关闭输入流  
            document.close();  
            fis.close();  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        } catch (IOException ex) {  
            Logger.getLogger(PDFReader.class.getName()).log(Level.SEVERE, null, ex);  
        }  
    }  
  
    publicstaticvoidmain(String args[]){  
        String file="F:\\pdf\\2013\\000608_阳光股份_2013年年度报告(更新后)_1.pdf";  
        String savePath="E:\\result1.txt";  
        long startTime=System.currentTimeMillis();  
        extractTXT(file,savePath);  
        long endTime=System.currentTimeMillis();  
        System.out.println("读写所用时间为："+(endTime-startTime)+"ms");  
    }  
  
}
暮鼓晨钟nbv
关注
3
点赞
踩
4

收藏

觉得还不错? 一键收藏
1
评论
PDF读取框架pdfbox 图片读取和存储以及创建新pdf

Apache PDFbox是一个开源的、基于Java的、支持PDF文档生成的工具库，它可以用于创建新的PDF文档，修改现有的PDF文档，还可以从PDF文档中提取所需的内容。Apache PDFBox还包含了数个命令行工具。 Apache PDFbox于2016年4月26日发布了最新的2.0.1版。备注：本文代码均是基于2.0及以上版本编写。官网地址：https://pdfb
复制链接

扫一扫