// pdf util - CSDN blog
//
// Article link: https://blog.csdn.net/yingcly003/article/details/96864325
/*
 * Copyright 2018 Smyfinancial.com All right reserved. This software is the
 * confidential and proprietary information of Smyfinancial.com ("Confidential
 * Information"). You shall not disclose such Confidential Information and shall
 * use it only in accordance with the terms of the license agreement you entered
 * into with Smyfinancial.com.
 */
package com.ying.common;

import java.awt.image.BufferedImage;
import java.io.*;
import java.util.List;
import java.util.ListIterator;

import com.google.common.collect.Lists;
import com.lowagie.text.pdf.PdfReader;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.multipdf.Splitter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.smy.ass.filesearch.IndexItem;
import com.smy.ass.filesearch.Indexer;
import com.smy.ass.filesearch.Searcher;

import javax.imageio.ImageIO;

/**
 * 类PdfUtil.java的实现描述：pdf工具类
 *
 * @author CAIYONGFENG 2018年3月8日 下午6:10:15
 */
public class PdfUtil {

    private static Logger       logger              = LoggerFactory.getLogger(PdfUtil.class);
    //lucene索引文件存储路径
    //    private static final String INDEX_DIR = Property.getProperty("file.index.dir");
    private static final String INDEX_DIR           = "";
    private static final int    DEFAULT_RESULT_SIZE = 100;

    /**
     * 获取pdf文件的页数
     *
     * @param pdfFile pdf文件对象
     * @return 页数
     */
    public static Integer getPageCount(File pdfFile) {
        try {
            if (pdfFile == null) {
                throw new Exception("文件为空,请检查!");
            }
            if (!(pdfFile.getName().endsWith(".pdf") || pdfFile.getName().endsWith(".PDF"))) {
                throw new Exception("文件非pdf格式,请检查!");
            }
            PDDocument pdd = PDDocument.load(pdfFile);
            PDPageTree pages = pdd.getDocumentCatalog().getPages();
            return pages.getCount();
        } catch (Exception e) {
            logger.error("获取Pdf文件页数异常(默认返回1页)，原因：", e);
            return null;
        }
    }

    /**
     * 获取pdf文件的页数
     *
     * @param bytes pdf文件对象
     * @return 页数
     */
    public static Integer getPageCountByByte(byte[] bytes) {
        try {
            if (bytes == null) {
                throw new Exception("文件为空,请检查!");
            }
            PDDocument pdd = PDDocument.load(bytes);
            PDPageTree pages = pdd.getDocumentCatalog().getPages();
            return pages.getCount();
        } catch (Exception e) {
            logger.error("获取Pdf文件页数异常(默认返回1页)，原因：", e);
            return null;
        }
    }

    /**
     * 获取pdf文件的页数
     *
     * @param pdfFilePath pdf文件路径
     * @return 页数
     */
    public static Integer getPageCount(String pdfFilePath) {
        try {
            if (StringUtils.isBlank(pdfFilePath)) {
                throw new Exception("文件路径为空");
            }
            File pdfFile = new File(pdfFilePath);
            return getPageCount(pdfFile);
        } catch (Exception e) {
            logger.error("获取pdf文件的页数异常，原因：", e);
            return null;
        }
    }

    /**
     * 搜索pdf文件的内容
     *
     * @param pdfFile
     * @param str
     * @return
     */
    @Deprecated
    public static boolean isContainStr(File pdfFile, String str) {
        try {
            long start = System.currentTimeMillis();
            IndexItem pdfIndexItem = index(pdfFile);

            //创建索引器
            Indexer indexer = new Indexer(INDEX_DIR);
            indexer.index(pdfIndexItem);
            indexer.close();

            //创建搜索器
            Searcher searcher = new Searcher(INDEX_DIR);
            int result = searcher.findByContent(str, DEFAULT_RESULT_SIZE);
            logger.info("cost:" + (System.currentTimeMillis() - start) + " ms");
            if (result == 1) {
                return true;
            } else {
                return false;
            }
        } catch (Exception e) {
            logger.error("判断pdf文件是否包含指定字符串失败，原因：", e);
            throw new RuntimeException(e);
        }
    }

    //生成文件的索引信息
    private static IndexItem index(File file) throws IOException {
        org.apache.pdfbox.pdmodel.PDDocument doc = org.apache.pdfbox.pdmodel.PDDocument.load(file);
        String content = new org.apache.pdfbox.text.PDFTextStripper().getText(doc);
        doc.close();
        return new IndexItem((long) file.getName().hashCode(), file.getName(), content);
    }

    /**
     * pdf转word
     * 
     * @param pdfFile pdf文件
     * @param routerDirFullPath 存储全路径
     */
    public static void pdf2doc(File pdfFile, String routerDirFullPath) {
        try {
            PDDocument doc = PDDocument.load(pdfFile);
            int pagenumber = doc.getDocumentCatalog().getPages().getCount();

            FileOutputStream fos = new FileOutputStream(routerDirFullPath);
            Writer writer = new OutputStreamWriter(fos, "UTF-8");
            PDFTextStripper stripper = new PDFTextStripper();

            stripper.setSortByPosition(true);//排序
            //stripper.setWordSeparator("");//pdfbox对中文默认是用空格分隔每一个字，通过这个语句消除空格（视频是这么说的）
            stripper.setStartPage(1);//设置转换的开始页
            stripper.setEndPage(pagenumber);//设置转换的结束页
            stripper.writeText(doc, writer);
            writer.close();
            doc.close();
        } catch (InvalidPasswordException e) {
            logger.error("pdf2doc error 密码错误！reason:", e);
        } catch (Exception e) {
            logger.error("pdf2doc error!reason:", e);
        }
    }

    /**
     * 图片转pdf
     */
    public static void imageToPdf(String idCardBackUrl, String idCardFrontUrl) {
        PDDocument document = null;
        PDPageContentStream contentStream = null;
        InputStream input = null;
        ByteArrayOutputStream baOut = new ByteArrayOutputStream();
        try {
            // 创建pdf文档和A4空白页
            document = new PDDocument();
            PDPage blankPage = new PDPage(PDRectangle.A4);
            document.addPage(blankPage);

            // 获取身份证图片
            PDImageXObject idCardBackImg = PDImageXObject.createFromFile(idCardBackUrl, document);
            PDImageXObject idCardFrontImg = PDImageXObject.createFromFile(idCardFrontUrl, document);

            // 图片等比例缩放
            float backZoomWidth;
            float backZoomHeight;
            if (idCardBackImg.getWidth() >= idCardBackImg.getHeight()) {
                backZoomWidth = 320f;
                backZoomHeight = 320f / idCardBackImg.getWidth() * idCardBackImg.getHeight();
            } else {
                backZoomHeight = 320f;
                backZoomWidth = 320f / idCardBackImg.getHeight() * idCardBackImg.getWidth();
            }
            float frontZoomWidth;
            float frontZoomHeight;
            if (idCardFrontImg.getWidth() >= idCardFrontImg.getHeight()) {
                frontZoomWidth = 320f;
                frontZoomHeight = 320f / idCardFrontImg.getWidth() * idCardFrontImg.getHeight();
            } else {
                frontZoomHeight = 320f;
                frontZoomWidth = 320f / idCardFrontImg.getHeight() * idCardFrontImg.getWidth();
            }

            // A4页宽高
            float boxWith = blankPage.getCropBox().getWidth();
            float boxHeight = blankPage.getCropBox().getHeight();

            contentStream = new PDPageContentStream(document, blankPage);
            contentStream.drawImage(idCardBackImg, (boxWith - backZoomWidth) / 2, (boxHeight / 2 - backZoomHeight) / 2,
                    backZoomWidth, backZoomHeight);
            contentStream.drawImage(idCardFrontImg, (boxWith - frontZoomWidth) / 2,
                    (boxHeight / 2 - frontZoomHeight) / 2 + boxHeight / 2, frontZoomWidth, frontZoomHeight);
            contentStream.close();
            document.save(baOut);

            input = new ByteArrayInputStream(baOut.toByteArray());
            FileUtils.writeByteArrayToFile(new File("D:\\test\\testPdf.pdf"), IOUtils.toByteArray(input));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            IOUtils.closeQuietly(document);
            IOUtils.closeQuietly(baOut);
            IOUtils.closeQuietly(input);
        }
    }

    /**
     * 合并pdf文件
     * 
     * @param pdfFullPaths pdf文件路径
     * @param desPdfPath 合并后的pdf文件路径
     */
    public static void mergePdf(String[] pdfFullPaths, String desPdfPath) {
        try {
            PDFMergerUtility mergePdf = new PDFMergerUtility();
            for (int i = 0; i < pdfFullPaths.length; i++) {
                mergePdf.addSource(pdfFullPaths[i]);
            }
            mergePdf.setDestinationFileName(desPdfPath);
            MemoryUsageSetting memUsageSetting = MemoryUsageSetting.setupTempFileOnly();
            mergePdf.mergeDocuments(memUsageSetting);
        } catch (FileNotFoundException e) {
            logger.error("文件不存在,原因：", e);
        } catch (Exception e) {
            logger.error("合并pdf文件异常，原因：", e);
        }
    }

    /**
     * 等比例缩放
     */
    private static float getZoomSize(float widthImg, float heightImg, float zoomWidth) {
        return zoomWidth * heightImg / widthImg;
    }

    /**
     * 获取pdf文件内容
     * 
     * @param file
     * @return
     */
    public static String getPdfText(File file) {
        try {
            org.apache.pdfbox.pdmodel.PDDocument doc = org.apache.pdfbox.pdmodel.PDDocument.load(file);
            String content = new PDFTextStripper().getText(doc);
            return content;
        } catch (IOException e) {
            logger.error("pdf获取文字内容出错", e);
        }
        return null;
    }

    /**
     * 合并指定页并生成新PDF文件
     * @param pdfPath
     * @param mergedFileName
     * @param savePath
     * @param pageNumbers
     * @return
     */
    public static String mergerPages(String pdfPath,String mergedFileName,String savePath ,int ... pageNumbers) {
        try {
            List<String> mergedFilePaths = Lists.newArrayList();
            Integer pageCount = getPageCount(new File(pdfPath));
            //generate pageFile
            for (int pageNumber:pageNumbers) {
                if(pageNumber > pageCount){
                    throw new Exception("指定页码不存在！");
                }
                String splitFileName = mergedFileName.concat("_").concat(String.valueOf(pageNumber));
                String mergedFilePath = splitPdf(pageNumber, pdfPath,splitFileName , savePath);
                mergedFilePaths.add(mergedFilePath);
            }

            PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();
            //merge pageFile to new Pdf File
            List<InputStream> fileInputStreamList = Lists.newArrayList();
            for (String mergedFilePath:mergedFilePaths) {
                File pageFile = new File(mergedFilePath);
                FileInputStream fileInputStream = new FileInputStream(pageFile);
                fileInputStreamList.add(fileInputStream);
            }

            pdfMergerUtility.addSources(fileInputStreamList);
            String realFilePath = FilePathUtil.getRealFilePath(savePath.concat(File.separator).concat(mergedFileName));
            pdfMergerUtility.setDestinationFileName(realFilePath);
            pdfMergerUtility.mergeDocuments(MemoryUsageSetting.setupMainMemoryOnly());
            //close stream
            for (InputStream inputStream : fileInputStreamList) {
                if(inputStream!=null){
                    inputStream.close();
                }
            }
            //delete tmpFile
            for (String mergedFilePath : mergedFilePaths) {
                File pageFile = new File(mergedFilePath);
                pageFile.delete();
            }
            return realFilePath;
        }catch (Exception e){
            logger.error("合并PDF页面失败，原因：",e);
        }
        return null;
    }

    /**
     * 分割pdf文件某一页并保存为一个新pdf文件
     * @param pageNum 页码
     * @param filePath pdf源文件全路径
     * @param fileName 新文件名称
     * @param outPath 新文件保存的路径
     * @return 新文件的全路径
     */
    public static String splitPdf(int pageNum, String filePath, String fileName, String outPath) {
        File indexFile = new File(filePath);// 这是对应文件名
        PDDocument document = null;
        try {
            document = PDDocument.load(indexFile);
            Splitter splitter = new Splitter();
            splitter.setStartPage(pageNum);
            splitter.setEndPage(pageNum);
            java.util.List<PDDocument> pages = splitter.split(document);
            ListIterator<PDDocument> iterator = pages.listIterator();
            while (iterator.hasNext()) {
                File file = new File(outPath);
                if (!file.exists()) {
                    file.mkdirs();
                }
                PDDocument pd = iterator.next();
                File newFile = new File(outPath + fileName);
                if (newFile.exists()) {
                    newFile.delete();
                }
                pd.save(outPath + fileName);
                pd.close();
                if (newFile.exists()) {
                    return newFile.getPath();
                }
            }
            document.close();
        } catch (IOException e) {
            logger.error("分割PDF文件失败，原因：",e);
        } catch (Exception e) {
            logger.error("分割PDF文件异常，原因：",e);
        }
        return null;
    }

    /**
     * 在PDF文件中查找指定文本
     * @param pdfPath pdf文件全路径
     * @return 页码
     */
    public static int findInPdf(String pdfPath,String keyWords){
        try {
            File pdfFile = new File(pdfPath);
            org.apache.pdfbox.pdmodel.PDDocument doc = org.apache.pdfbox.pdmodel.PDDocument.load(pdfFile);
            PDPageTree pages = doc.getPages();
            int index = -1;
            for (int i = 0; i < pages.getCount(); i++) {
                PDFTextStripper pdfTextStripper = new PDFTextStripper();
                pdfTextStripper.setStartPage(i);
                pdfTextStripper.setEndPage(i);
                String text = pdfTextStripper.getText(doc);
                int result = text.indexOf(keyWords);
                if (result != -1) {
                    index = i;
                    break;
                }
            }
            doc.close();

            return index;
        }catch (Exception e){
            logger.error("查找文本异常，原因：",e);
            throw new RuntimeException(e);
        }
    }


    public static void pdf2Image(String pdfFilePath, String imageFilePath, int rotation) {
        File file = new File(pdfFilePath);
        PDDocument pdDocument;
        try {

            long startPdfMills = System.currentTimeMillis();

            String imgFolderPath = imageFilePath;

            pdDocument = PDDocument.load(file);
            PDFRenderer renderer = new PDFRenderer(pdDocument);
            /* dpi越大转换后越清晰，相对转换速度越慢 */
            PdfReader reader = new PdfReader(pdfFilePath);
            int pages = reader.getNumberOfPages();

            for (int i = 0; i < pages; i++) {
                long createImgStartMills = System.currentTimeMillis();

                File dstFile = new File(imgFolderPath);
                if (!dstFile.exists()) {
                    dstFile.createNewFile();
                }

                // 旋转角度
                if (rotation > 0) {
                    pdDocument.getPage(i).setRotation(rotation);
                }

                BufferedImage image = renderer.renderImageWithDPI(i, 144);
                ImageIO.write(image, "jpg", dstFile);
                logger.info("生成图片：{} time:{}ms.", imgFolderPath, System.currentTimeMillis()-createImgStartMills);
                break;
            }

        } catch (IOException e) {
            logger.error("pdf2img fail pdfFilePath:{}", pdfFilePath, e);
            throw new RuntimeException(e);
        }
    }


    public static void main(String[] args) {
        // String pdfPath = "D:\\data\\欧春蕊.pdf";D091704250000120005
        // System.out.println(getPdfText(new File(pdfPath)));


        /*
        String pdfPath2 = "D:\\1.smy_docs\\2期-201706（全）.PDF";
        int index = findInPdf(pdfPath2, "D091704250000120005");
        System.out.println("index:" + index);
        */

/*        String mergedPath = PdfUtil.mergerPages("C:\\Users\\shan\\Desktop\\诉讼四期优化\\实时代偿3期-201710.pdf",
                "merged.pdf", "d:\\", 1, 3);
        System.out.println(mergedPath);*/

        File file = new File("C:\\Users\\shan\\Desktop\\诉讼四期优化\\实时代偿3期-201710.pdf");
        Integer pageCount = PdfUtil.getPageCount(file);

    }

}