使用 PDFBox 替换 PDF 模板中的标签,并将多个 PDF 合并为一个 PDF(目录中如有图片,则将图片作为单独页面一并插入)

性能参考:替换 100 个 PDF 模板中的部分标签,并将这 100 个模板文档与 10 张约 400 KB 的图片合并为一个 PDF 文档,

总耗时约 20 秒。

 1.导入pdfbox

 <dependencies>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.1</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/log4j/log4j -->
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>maven-repository.junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.13.2</version>
        </dependency>
        <!--   word转pdf     -->
        <dependency>
            <groupId>fr.opensagres.xdocreport</groupId>
            <artifactId>fr.opensagres.poi.xwpf.converter.pdf</artifactId>
            <version>2.0.2</version>
        </dependency>

    </dependencies>

 2.上代码

package main.java;

import fr.opensagres.poi.xwpf.converter.pdf.PdfConverter;
import fr.opensagres.poi.xwpf.converter.pdf.PdfOptions;
import org.apache.log4j.Logger;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.junit.Test;

import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.stream.ImageInputStream;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;

/**
 * @ClassName PdfboxSummary
 */
public class PdfboxSummary {
    // Logger is keyed to this class so log output is attributed to PdfboxSummary
    // (it previously pointed at PdfboxReplace, which is not declared in this file).
    private static final Logger log = Logger.getLogger(PdfboxSummary.class);

    /**
     * Replaces the template tags in every PDF found in {@code D:\merge}, appends each
     * processed PDF to a single destination document, adds one page per image file
     * (jpg / jpeg / png), and saves the result to {@code D:\merged.pdf}.
     *
     * <p>Files are processed in the order returned by {@code getFile}. Start time, end time
     * and elapsed milliseconds are printed to standard output.
     *
     * @throws Exception if a source file cannot be read or the merged file cannot be written
     */
    @Test
    public void pdfMergeONE() throws Exception {
        // Path of the merged output file.
        String outputFile = "D:\\merged.pdf";

        long start = System.currentTimeMillis();
        System.out.println("===start===" + start);

        // Tag -> replacement value.
        HashMap<String, String> replaceMap = new HashMap<>();
        replaceMap.put("<<D1>>", "D1D1D1");
        replaceMap.put("<<F7>>", "F7F7F7");
        replaceMap.put("<<Annual>>", "AnnualAnnualAnnual");
        replaceMap.put("<<E6>>", "E6E6E6E6E6");
        replaceMap.put("<<Month>>", "MonthMonthMonth");
        replaceMap.put("<<EffDate>>", "EffDateEffDateEffDate");
        replaceMap.put("<<R22>>", "R22R22R22R22");

        PDFMergerUtility pdfMergerUtility = new PDFMergerUtility();

        // try-with-resources: the destination is closed even when a source file fails.
        try (PDDocument destination = new PDDocument()) {
            // Paths of the files to process.
            List<String> fileNameList = getFile("D:\\merge");
            for (String filePath : fileNameList) {
                String typeStr = extensionOf(filePath);
                if ("pdf".equalsIgnoreCase(typeStr)) {
                    // PDF template: replace the tags, then append it to the destination.
                    // The source is closed right after the append, as before.
                    try (PDDocument pdfDocument = PDDocument.load(new File(filePath))) {
                        for (String key : replaceMap.keySet()) {
                            replacePdfText(pdfDocument, key, replaceMap.get(key));
                        }
                        pdfMergerUtility.appendDocument(destination, pdfDocument);
                    }
                } else if ("jpg".equalsIgnoreCase(typeStr) || "png".equalsIgnoreCase(typeStr)
                        || "jpeg".equalsIgnoreCase(typeStr)) {
                    // Image: insert it as a page of its own.
                    insertImageToPdf(destination, filePath);
                }
            }

            // appendDocument has already merged every source into the destination. No
            // sources are registered on the utility, so the former mergeDocuments(...) call
            // had nothing to merge and has been dropped.
            destination.save(outputFile);
        }

        long end = System.currentTimeMillis();
        System.out.println("===end===" + end);
        long total = end - start;
        System.out.println("===total===" + total);
    }

    /**
     * Returns the extension of a file path: the text after the last dot of the file name,
     * or an empty string when the name has no dot. Dots in directory names are ignored.
     */
    private static String extensionOf(String filePath) {
        String fileName = new File(filePath).getName();
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? "" : fileName.substring(dot + 1);
    }

    /**
     * Appends a new page to the document and draws the given image centered on it.
     *
     * <p>The image is never enlarged. When it is wider or taller than the page it is scaled
     * down uniformly so that it fits inside the page in both dimensions.
     *
     * @param document  document that receives the new page
     * @param imagePath path of the image file to insert
     * @return the same {@code document} instance
     * @throws IOException if the image cannot be read or the page content cannot be written
     */
    private static PDDocument insertImageToPdf(PDDocument document ,String imagePath) throws IOException {
        PDPage page = new PDPage();
        PDImageXObject pdImage = PDImageXObject.createFromFile(imagePath, document);

        float pageWidth = page.getMediaBox().getWidth();
        float pageHeight = page.getMediaBox().getHeight();
        int imageHeight = pdImage.getHeight();
        int imageWidth = pdImage.getWidth();

        // Limit by width AND height: a width-only scale lets tall images run off the page
        // (the vertical offset below would become negative).
        float scale = Math.min(1f, Math.min(pageWidth / imageWidth, pageHeight / imageHeight));
        float drawWidth = imageWidth * scale;
        float drawHeight = imageHeight * scale;

        // try-with-resources: the content stream is closed even if drawing fails.
        try (PDPageContentStream contents = new PDPageContentStream(document, page)) {
            contents.drawImage(pdImage, (pageWidth - drawWidth) / 2, (pageHeight - drawHeight) / 2,
                    drawWidth, drawHeight);
        }

        document.addPage(page);
        return document;
    }

    /**
     * Replaces a template tag such as {@code <<A1>>} in the text-showing operators of every
     * page of the document.
     *
     * <p>Two operators are handled:
     * <ul>
     *   <li>{@code Tj} (single string operand): every occurrence of {@code searchString}
     *       inside the string is replaced.</li>
     *   <li>{@code TJ} (array operand): a tag may be split over several strings and even over
     *       several consecutive arrays (e.g. {@code "<<A1"} + {@code ">>"}). Text is collected
     *       from the first {@code <} on; when the collected text, trimmed, equals
     *       {@code searchString}, the first array that took part is reduced to the replacement
     *       string and the other arrays that took part are emptied.</li>
     * </ul>
     *
     * <p>Strings are decoded with {@code COSString.getString()} and the replacement is encoded
     * by the {@code COSString(String)} constructor. NOTE(review): text drawn with a subset or
     * custom-encoded font may not show the replacement correctly — verify with real templates.
     *
     * <p>A page's content stream is rewritten only when something on that page was replaced.
     *
     * @param document     document to modify in place
     * @param searchString tag to look for; {@code null} or empty leaves the document untouched
     * @param replacement  text that replaces the tag
     * @return the same {@code document} instance
     * @throws IOException if a content stream cannot be parsed or written
     */
    private static PDDocument replacePdfText(PDDocument document, String searchString, String replacement) throws IOException {
        if (searchString == null || searchString.isEmpty()) {
            // Nothing meaningful to search for.
            return document;
        }

        for (PDPage page : document.getPages()) {
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();
            List<Object> tokens = parser.getTokens();

            // TJ arrays that contributed to the tag currently being collected.
            List<COSArray> keyList = new ArrayList<>();
            // Text collected since the first '<' seen in a TJ array.
            String pstring = "";
            // True once the collected text contains "<<".
            boolean isStart = false;
            // True once any token on this page has been changed.
            boolean modified = false;

            for (int j = 0; j < tokens.size(); j++) {
                Object next = tokens.get(j);
                if (!(next instanceof Operator)) {
                    continue;
                }
                String opName = ((Operator) next).getName();
                // The operand of Tj / TJ is the token right before the operator.
                Object operand = j > 0 ? tokens.get(j - 1) : null;

                if ("Tj".equals(opName) && operand instanceof COSString) {
                    COSString previous = (COSString) operand;
                    String string = previous.getString();
                    // Touch the string only when it really contains the tag: re-encoding
                    // untouched strings can corrupt text that is not plain ASCII.
                    if (string.contains(searchString)) {
                        String updated = string.replace(searchString, replacement);
                        previous.setValue(new COSString(updated).getBytes());
                        modified = true;
                    }
                } else if ("TJ".equals(opName) && operand instanceof COSArray) {
                    COSArray previous = (COSArray) operand;
                    for (int k = 0; k < previous.size(); k++) {
                        Object arrElement = previous.getObject(k);
                        if (arrElement instanceof COSString) {
                            String string = ((COSString) arrElement).getString();
                            // Start collecting at the first '<' and keep collecting afterwards.
                            if (pstring.contains("<") || string.contains("<")) {
                                pstring += string;
                            }
                        }
                    }

                    if (pstring.contains("<<")) {
                        isStart = true;
                    }

                    if (pstring.contains("<<") && pstring.contains(">>")
                            && searchString.equals(pstring.trim())) {
                        System.out.println(pstring);
                        keyList.add(previous);

                        // The first array carries the replacement. It is rebuilt rather than
                        // patched at index 0, because element 0 may be a kerning number.
                        COSArray head = keyList.get(0);
                        head.clear();
                        head.add(new COSString(replacement));
                        // Every later array only held fragments of the tag: empty it.
                        for (int i = 1; i < keyList.size(); i++) {
                            keyList.get(i).clear();
                        }

                        keyList.clear();
                        pstring = "";
                        isStart = false;
                        modified = true;
                    } else if (isStart) {
                        keyList.add(previous);
                    }
                }

                // A complete "<<...>>" that was not the searched tag: start over.
                if (pstring.contains(">>")) {
                    pstring = "";
                    isStart = false;
                    keyList.clear();
                }
            }

            if (!modified) {
                // Leave the original content stream(s) of this page as they are.
                continue;
            }

            PDStream updatedStream = new PDStream(document);
            try (OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE)) {
                ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
                tokenWriter.writeTokens(tokens);
            }
            page.setContents(updatedStream);
        }

        return document;
    }

    /**
     * Lists the regular files that sit directly inside a folder (sub-folders are skipped and
     * not descended into).
     *
     * <p>The paths are returned sorted, so callers process files in a reproducible order;
     * {@link File#listFiles()} itself gives no ordering guarantee.
     *
     * @param path folder path
     * @return sorted paths of the files in the folder; an empty list when {@code path} is
     *         {@code null}, does not exist, is not a directory, or cannot be read
     */
    private static List<String> getFile(String path) {
        if (path == null) {
            return new ArrayList<>();
        }
        // listFiles() returns null when the path is not a readable directory.
        File[] entries = new File(path).listFiles();
        if (entries == null) {
            return new ArrayList<>();
        }

        List<String> fileNameList = new ArrayList<>(entries.length);
        for (File entry : entries) {
            if (entry.isFile()) {
                fileNameList.add(entry.getPath());
            }
        }
        java.util.Collections.sort(fileNameList);
        return fileNameList;
    }

    /**
     * Reads an image file through an {@code ImageInputStream} and appends one page per
     * contained image to the document, using {@code pageAddImage}.
     *
     * <p>The reader is looked up by the format name {@code "jpeg"}, so the file is expected
     * to be a JPEG. The document is modified in place and is not saved here; the elapsed time
     * is written to the log.
     *
     * @param pdfDocument document that receives the new pages
     * @param filePath    path of the image file
     * @return the same {@code pdfDocument} instance
     * @throws Exception if no JPEG reader is available, or (wrapping the original
     *                   {@link IOException}) if the file cannot be read or a page cannot be
     *                   written
     */
    private static PDDocument insertToPdfByStream(PDDocument pdfDocument,String filePath) throws Exception {
        Iterator<ImageReader> iterator = ImageIO.getImageReadersByFormatName("jpeg");
        if (!iterator.hasNext()) {
            throw new Exception("The JDK does not support");
        }
        ImageReader imageReader = iterator.next();

        long timeMillis = System.currentTimeMillis();
        // try-with-resources: both streams are closed on success and on failure.
        try (InputStream fileInput = new FileInputStream(filePath);
             ImageInputStream imageInputStream = ImageIO.createImageInputStream(fileInput)) {
            if (imageInputStream == null) {
                // createImageInputStream returns null when no suitable provider exists.
                throw new IOException("Cannot create an ImageInputStream for " + filePath);
            }

            imageReader.setInput(imageInputStream);

            int size = imageReader.getNumImages(true);
            for (int i = 0; i < size; i++) {
                BufferedImage image = imageReader.read(i);
                pageAddImage(pdfDocument, image);
            }

            // The document used to be serialised into a throw-away in-memory buffer here;
            // the bytes were never used, so that step has been removed.
            return pdfDocument;
        } catch (IOException e) {
            log.error("To PDF Page Error", e);
            // Keep the original failure as the cause.
            throw new Exception("Conversion PDF Error", e);
        } finally {
            // Release the reader's resources.
            imageReader.dispose();
            log.info("to pdf used time: "+(System.currentTimeMillis() - timeMillis));
        }
    }

    /**
     * Appends a new page to the document and draws the image on it, horizontally centered
     * and aligned to the top edge of the page.
     *
     * <p>The image is never enlarged. When it is wider or taller than the page it is scaled
     * down uniformly so that it fits inside the page in both dimensions.
     *
     * @param newPdf document that receives the new page
     * @param image  image to draw
     * @throws IOException if the image cannot be embedded or the page content cannot be written
     */
    private static void pageAddImage(PDDocument newPdf, BufferedImage image) throws IOException {
        PDPage page = new PDPage();
        newPdf.addPage(page);

        float width = page.getMediaBox().getWidth();
        float height = page.getMediaBox().getHeight();

        // Limit by width AND height: a width-only scale lets tall images run off the bottom
        // of the page (the y origin below would become negative).
        float scale = Math.min(1f, Math.min(width / image.getWidth(), height / image.getHeight()));

        float imgWidth = image.getWidth() * scale;
        float imgHeight = image.getHeight() * scale;

        try (PDPageContentStream pageContentStream = new PDPageContentStream(newPdf, page)) {
            PDImageXObject pdImage = LosslessFactory.createFromImage(newPdf, image);
            // PDF coordinates start at the bottom-left corner, so the top-aligned image has
            // its lower edge at (page height - image height).
            pageContentStream.drawImage(pdImage, (width - imgWidth) / 2, height - imgHeight, imgWidth, imgHeight);
        }
    }

    /**
     * Converts a Word (.docx) file to PDF and prints the path of the generated PDF.
     *
     * <p>Known limitation noted by the author: some content may be lost in the conversion.
     *
     * @param docFilePath path of the Word file to read
     * @param pdfFilePath path of the PDF file to write
     * @throws Exception if the Word file cannot be read or the PDF cannot be written
     */
    private static void wordToPdf(String docFilePath,String pdfFilePath) throws Exception {
        // try-with-resources: the input stream (previously never closed), the document and
        // the output stream are all released, even when the conversion throws. Resources are
        // opened in the same order as before, so the output file is only created once the
        // Word document has been parsed successfully.
        try (InputStream docFile = new FileInputStream(docFilePath);
             XWPFDocument doc = new XWPFDocument(docFile);
             OutputStream out = new FileOutputStream(pdfFilePath)) {
            PdfOptions pdfOptions = PdfOptions.create();
            PdfConverter.getInstance().convert(doc, out, pdfOptions);
        }
        System.out.println(pdfFilePath);
    }
}

参考博文:

用 Java 中的 PDFbox 替换或删除 PDF 中的文本 - IT屋-程序员软件开发技术分享社区

https://www.cnblogs.com/tankqiu/articles/4246776.html

教程 - PDFBox 中文文档 - 文江博客 

Word转为PDF(Java实现)_chengp919的博客-CSDN博客_java word转pdf

 

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
Java 通过 pdfbox 库可以实现图片转 PDF 的功能,同时也可以通过该库对 PDF 中的图片进行压缩。以下是一个示例代码:

```java
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import java.io.File;
import java.io.IOException;

public class ImageToPdf {
    public static void main(String[] args) throws IOException {
        // 创建一个空白的PDF文档
        PDDocument document = new PDDocument();
        // 加载要转换为PDF图片文件
        File imageFile = new File("image.jpg");
        // 创建一个页面对象
        PDPage page = new PDPage();
        // 将图片转换为PDF图像对象
        PDImageXObject image = JPEGFactory.createFromImage(document, ImageIO.read(imageFile), 0.5f);
        // 在页面上添加图像
        page.getCropBox();
        page.setMediaBox(image.getCropBox());
        page.setRotation(0);
        page.setArtBox(image.getCropBox());
        page.setBleedBox(image.getCropBox());
        page.setTrimBox(image.getCropBox());
        PDPageContentStream contentStream = new PDPageContentStream(document, page);
        contentStream.drawImage(image, 0, 0);
        // 关闭页面内容流
        contentStream.close();
        // 将页面添加到文档中
        document.addPage(page);
        // 保存PDF文件
        document.save("image.pdf");
        // 关闭PDF文档
        document.close();
    }
}
```

如果需要压缩 PDF 中的图片,可以使用下面的代码对 PDF 中的所有图片进行压缩:

```java
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.image.JPEGFactory;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import java.io.File;
import java.io.IOException;

public class CompressPdfImages {
    public static void main(String[] args) throws IOException {
        // 加载PDF文件
        PDDocument document = PDDocument.load(new File("input.pdf"));
        // 遍历所有页面
        for (PDPage page : document.getPages()) {
            // 获取页面中的所有图像
            for (PDImageXObject image : page.getResources().getXObjectNames().stream()
                    .map(x -> page.getResources().getXObject((COSName) x))
                    .filter(x -> x instanceof PDImageXObject)
                    .map(x -> (PDImageXObject) x)
                    .toArray(PDImageXObject[]::new)) {
                // 如果图像是JPEG格式,则进行压缩
                if ("jpg".equals(image.getSuffix())) {
                    PDImageXObject compressedImage = JPEGFactory.createFromImage(document, image.getImage(), 0.5f);
                    page.getResources().add(compressedImage);
                    page.getResources().remove(image.getName());
                }
                // 如果图像是PNG格式,则进行压缩
                else if ("png".equals(image.getSuffix())) {
                    PDImageXObject compressedImage = LosslessFactory.createFromImage(document, image.getImage(), 0.5f);
                    page.getResources().add(compressedImage);
                    page.getResources().remove(image.getName());
                }
            }
        }
        // 保存压缩后的PDF文件
        document.save("output.pdf");
        // 关闭PDF文档
        document.close();
    }
}
```

如果需要将多个 PDF 文件合并为一个 PDF 文件,可以使用下面的代码:

```java
import org.apache.pdfbox.multipdf.PDFMergerUtility;
import java.io.File;
import java.io.IOException;

public class MergePdfFiles {
    public static void main(String[] args) throws IOException {
        // 创建一个PDF合并工具对象
        PDFMergerUtility merger = new PDFMergerUtility();
        // 添加要合并PDF文件
        merger.addSource(new File("file1.pdf"));
        merger.addSource(new File("file2.pdf"));
        merger.addSource(new File("file3.pdf"));
        // 合并PDF文件
        merger.setDestinationFileName("merged.pdf");
        merger.mergeDocuments(null);
    }
}
```

以上是 Java 中使用 pdfbox 库实现图片转 PDF、压缩 PDF 中的图片、合并多个 PDF 文件为一个 PDF 文件的示例代码。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值