（8）PDFBOX读取PDF（元数据、纲要、文本、图片）

最新推荐文章于 2024-03-06 16:47:40 发布

Fighting_No1

最新推荐文章于 2024-03-06 16:47:40 发布

阅读量1.3w

点赞数 4

分类专栏：文件读写　文章标签：pdf pdfbox 读取pdf 提取pdf文本 java

本文链接：https://blog.csdn.net/fighting_no1/article/details/51038966

版权

文件读写专栏收录该内容

10 篇文章 0 订阅

订阅专栏

PDFBox是用Java实现的PDF文档处理类库，提供PDF文档的创建、处理以及文档内容提取功能，也包含了一些命令行实用工具。其主要特性包括：
1、提取PDF文件的Unicode文本
2、将PDF切分成多个PDF文件或合并多个PDF文件
3、从PDF表格中提取数据或填写PDF表格
4、验证PDF文件是否符合PDF/A-1b标准
5、使用标准的Java API打印PDF文件
6、将PDF文件保存为图像文件，如PNG、JPEG
7、创建一个PDF文件，包含嵌入的字体和图像
8、对PDF文件进行数字签名，以及对PDF文档进行加密与解密

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.util.PDFTextStripper;
import static readPDFContent.PDFParse.dateFormat;

/**
 *
 * @author Angela
 */
public class PDFReader {

    /** Shared logger used to report every I/O or parse failure raised inside this class. */
    private static final Logger LOGGER = Logger.getLogger(PDFReader.class.getName());

    /** Pattern applied by {@link #dateFormat(Calendar)}. */
    private static final String DATE_PATTERN = "yyyy-MM-dd HH:mm:ss";

    /**
     * Formats a calendar value as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param calendar the point in time to format; may be {@code null}
     * @return the formatted timestamp, or {@code null} when {@code calendar} is {@code null}
     */
    public static String dateFormat(Calendar calendar) {
        if (calendar == null) {
            return null;
        }
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        return new SimpleDateFormat(DATE_PATTERN).format(calendar.getTime());
    }

    /**
     * Prints the document outline (bookmarks) two levels deep: every top-level
     * entry as {@code Item:<title>} and each of its direct children as
     * {@code     Child:<title>}. Nothing is printed when the document has no outline.
     * Failures to open or parse the file are logged, not thrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFOutline(String file) {
        printOutline(file, true);
    }

    /**
     * Prints only the top-level outline entries, one {@code Item:<title>} line each.
     * Nothing is printed when the document has no outline.
     * Failures to open or parse the file are logged, not thrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFCatalog(String file) {
        printOutline(file, false);
    }

    /**
     * Prints the page count and the document-information entries (title, subject,
     * author, keywords, creator, producer, trapped flag, creation and modification
     * time) to standard output. Failures to open or parse the file are logged, not thrown.
     *
     * @param file path of the PDF document
     */
    public static void getPDFInformation(String file) {
        FileInputStream input = null;
        PDDocument document = null;
        try {
            input = new FileInputStream(file);
            document = PDDocument.load(input);
            PDDocumentInformation info = document.getDocumentInformation();

            System.out.println("页数:" + document.getNumberOfPages());

            System.out.println("标题:" + info.getTitle());
            System.out.println("主题:" + info.getSubject());
            System.out.println("作者:" + info.getAuthor());
            System.out.println("关键字:" + info.getKeywords());

            System.out.println("应用程序:" + info.getCreator());
            System.out.println("pdf 制作程序:" + info.getProducer());

            System.out.println("Trapped:" + info.getTrapped());

            System.out.println("创建时间:" + dateFormat(info.getCreationDate()));
            System.out.println("修改时间:" + dateFormat(info.getModificationDate()));
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to read PDF: " + file, ex);
        } finally {
            closeInput(document, input);
        }
    }

    /**
     * Extracts the text of the whole document and prints it to standard output,
     * prefixed with {@code 内容:}. Failures are logged, not thrown.
     *
     * @param file path of the PDF document
     */
    public static void extractTXT(String file) {
        extractText(file, null, null, false, null);
    }

    /**
     * Extracts the text of a page range and prints it to standard output,
     * prefixed with {@code 内容:}. Failures are logged, not thrown.
     *
     * @param file path of the PDF document
     * @param startPage first page handed to the text stripper
     * @param endPage last page handed to the text stripper
     */
    public static void extractTXT(String file, int startPage, int endPage) {
        extractText(file, startPage, endPage, false, null);
    }

    /**
     * Saves every image found in the resources of each page. Images are numbered
     * consecutively from 1 across the whole document and written to
     * {@code imgSavePath + <number>}; the image writer appends the file suffix.
     * Pages without a resource dictionary are skipped. Failures are logged, not thrown.
     *
     * @param file path of the PDF document
     * @param imgSavePath path prefix for the image files
     */
    public static void extractImage(String file, String imgSavePath) {
        FileInputStream input = null;
        PDDocument document = null;
        try {
            input = new FileInputStream(file);
            document = PDDocument.load(input);
            List<?> pages = document.getDocumentCatalog().getAllPages();
            int imageNumber = 1;
            for (Object element : pages) {
                PDPage page = (PDPage) element;
                if (page == null) {
                    continue;
                }
                PDResources resources = page.findResources();
                if (resources == null) {
                    // A page may carry no resources at all; there is nothing to extract from it.
                    continue;
                }
                Map<String, PDXObjectImage> images = resources.getImages();
                if (images == null) {
                    continue;
                }
                for (PDXObjectImage image : images.values()) {
                    image.write2file(imgSavePath + imageNumber);
                    imageNumber++;
                }
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to extract images from PDF: " + file, ex);
        } finally {
            closeInput(document, input);
        }
    }

    /**
     * Extracts the text of the whole document and writes it to a file using the
     * platform default charset. The output file is only created after the PDF
     * has been parsed. Failures are logged, not thrown.
     *
     * @param file path of the PDF document
     * @param savePath path of the text file to write
     */
    public static void extractTXT(String file, String savePath) {
        extractText(file, null, null, true, savePath);
    }

    /**
     * Extracts the text of a page range and writes it to a file using the
     * platform default charset. The output file is only created after the PDF
     * has been parsed. Failures are logged, not thrown.
     *
     * @param file path of the PDF document
     * @param startPage first page handed to the text stripper
     * @param endPage last page handed to the text stripper
     * @param savePath path of the text file to write
     */
    public static void extractTXT(String file, int startPage,
            int endPage, String savePath) {
        extractText(file, startPage, endPage, true, savePath);
    }

    /**
     * Demo entry point: extracts the text of one PDF into a text file and prints
     * the elapsed time.
     *
     * @param args optional overrides: {@code args[0]} is the PDF path and
     *             {@code args[1]} the output path; the built-in sample paths are
     *             used for whichever one is missing
     */
    public static void main(String args[]) {
        String file = (args != null && args.length > 0)
                ? args[0]
                : "F:\\pdf\\2013\\000608_阳光股份_2013年年度报告(更新后)_1.pdf";
        String savePath = (args != null && args.length > 1)
                ? args[1]
                : "E:\\result1.txt";
        long startTime = System.currentTimeMillis();
        extractTXT(file, savePath);
        long endTime = System.currentTimeMillis();
        System.out.println("读写所用时间为：" + (endTime - startTime) + "ms");
    }

    /**
     * Walks the outline of a document and prints its entries.
     *
     * @param file path of the PDF document
     * @param includeChildren whether the direct children of each top-level entry are printed too
     */
    private static void printOutline(String file, boolean includeChildren) {
        FileInputStream input = null;
        PDDocument document = null;
        try {
            input = new FileInputStream(file);
            document = PDDocument.load(input);
            PDDocumentOutline outline = document.getDocumentCatalog().getDocumentOutline();
            if (outline == null) {
                // Documents without bookmarks have no outline object; check before dereferencing it.
                return;
            }
            for (PDOutlineItem item = outline.getFirstChild(); item != null;
                    item = item.getNextSibling()) {
                System.out.println("Item:" + item.getTitle());
                if (!includeChildren) {
                    continue;
                }
                for (PDOutlineItem child = item.getFirstChild(); child != null;
                        child = child.getNextSibling()) {
                    System.out.println("    Child:" + child.getTitle());
                }
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to read PDF outline: " + file, ex);
        } finally {
            closeInput(document, input);
        }
    }

    /**
     * Shared implementation of the {@code extractTXT} overloads.
     *
     * @param file path of the PDF document
     * @param startPage first page to extract, or {@code null} to keep the stripper default
     * @param endPage last page to extract, or {@code null} to keep the stripper default
     * @param saveToFile {@code true} to write the text to {@code savePath},
     *                   {@code false} to print it to standard output
     * @param savePath output file path; only read when {@code saveToFile} is {@code true}
     */
    private static void extractText(String file, Integer startPage, Integer endPage,
            boolean saveToFile, String savePath) {
        FileInputStream input = null;
        PDDocument document = null;
        Writer writer = null;
        try {
            input = new FileInputStream(file);
            PDFParser parser = new PDFParser(input);
            parser.parse();
            document = parser.getPDDocument();

            PDFTextStripper stripper = new PDFTextStripper();
            if (startPage != null) {
                stripper.setStartPage(startPage);
            }
            if (endPage != null) {
                stripper.setEndPage(endPage);
            }

            if (saveToFile) {
                // Opened only after parsing succeeded so a bad input does not create or truncate the output.
                writer = new OutputStreamWriter(new FileOutputStream(savePath));
                stripper.writeText(document, writer);
            } else {
                System.out.println("内容:" + stripper.getText(document));
            }
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, "Failed to extract text from PDF: " + file, ex);
        } finally {
            if (writer != null) {
                try {
                    writer.close();
                } catch (IOException ex) {
                    LOGGER.log(Level.SEVERE, "Failed to close text output: " + savePath, ex);
                }
            }
            closeInput(document, input);
        }
    }

    /**
     * Closes the document and then its underlying stream, logging (not throwing)
     * any failure so that both are always attempted.
     *
     * @param document the loaded document, or {@code null} if loading never succeeded
     * @param input the file stream, or {@code null} if it was never opened
     */
    private static void closeInput(PDDocument document, FileInputStream input) {
        if (document != null) {
            try {
                document.close();
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, "Failed to close PDF document", ex);
            }
        }
        if (input != null) {
            try {
                input.close();
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, "Failed to close PDF input stream", ex);
            }
        }
    }

}

注意：上面的示例代码基于PDFBox 1.8.x的API。该程序无法读取加密的PDF文档；无法解析的PDF文档会抛出异常；扫描版（图片式）的PDF文档不含文本层，因此无法提取出文本。

Fighting_No1

关注

4
点赞
踩
24

收藏

觉得还不错? 一键收藏
5
评论
（8）PDFBOX读取PDF（元数据、纲要、文本、图片）

PDFBox是Java实现的PDF文档协作类库，提供PDF文档的创建、处理以及文档内容提取功能，也包含了一些命令行实用工具。其主要特性包括： 1、提取PDF文件的Unicode文本 2、将PDF切分成多个PDF文件或合并多个PDF文件 3、从PDF表格中提取数据或填写PDF表格 4、验证PDF文件是否符合PDF/A-1b标准 5、使用标准的java API打印PDF文件 6、将PDF文件
复制链接

扫一扫