java 替换pdf 文本,使用JAVA搜索和替换PDF中的文本

I need to replace the text in a PDF with text in a different language. As a first step, I tried to search for and replace a piece of text in the PDF file using the iText (itextpdf) and PDFBox APIs.

I used the code snippet below, which uses the itextpdf API, to search for the text "Hello" in the source PDF file and replace it with "Hi". The new PDF is created, but without any text replacements.

/**
 * Copies the PDF at {@code src} to {@code dest}, replacing every occurrence of "Hello" with
 * "Hi" in the raw bytes of each page's content stream.
 *
 * <p>Only pages whose {@code /Contents} entry is a single stream are modified; pages whose
 * contents are stored in any other form are copied unchanged.
 *
 * <p>NOTE(review): a byte-level replacement can only match text that appears literally in the
 * content stream. Text that the producer split across several show-text operands or wrote with
 * a custom font encoding will presumably not be found this way.
 *
 * @param src path of the PDF to read
 * @param dest path of the PDF to write
 * @throws Exception if the source cannot be read or the destination cannot be written
 */
public void manipulatePdf(String src, String dest) throws Exception {
    // try-with-resources closes (and thereby writes) the document even if processing fails.
    try (PdfDocument pdfDoc = new PdfDocument(new PdfReader(src), new PdfWriter(dest))) {
        int noOfPages = pdfDoc.getNumberOfPages();

        // Page numbers are 1-based, so the last page is noOfPages itself.
        for (int i = 1; i <= noOfPages; i++) {
            PdfPage page = pdfDoc.getPage(i);
            PdfDictionary dict = page.getPdfObject();
            PdfObject object = dict.get(PdfName.Contents);

            if (object instanceof PdfStream) {
                PdfStream stream = (PdfStream) object;
                byte[] data = stream.getBytes();

                // ISO-8859-1 maps every byte to exactly one char and back, so bytes that are
                // not part of the replaced text survive the String round trip unchanged.
                String content = new String(data, java.nio.charset.StandardCharsets.ISO_8859_1);
                String updated = content.replace("Hello", "Hi");
                stream.setData(updated.getBytes(java.nio.charset.StandardCharsets.ISO_8859_1));
            }
        }
    }
}

I also tried Apache PDFBox to achieve the same thing, but had no luck with that either. The code snippet is below for reference.

public static PDDocument replaceText(PDDocument document, String searchString, String replacement)

throws IOException {

for (PDPage page : document.getPages()) {

PDFStreamParser parser = new PDFStreamParser(page);

parser.parse();

List tokens = parser.getTokens();

for (int j = 0; j < tokens.size(); j++) {

Object next = tokens.get(j);

if (next instanceof Operator) {

Operator op = (Operator) next;

// Tj and TJ are the two operators that display strings in a PDF

if (op.getName().equals("Tj")) {

// Tj takes one operator and that is the string to display

// so lets update that operator

COSString previous = (COSString) tokens.get(j - 1);

String string = previous.getString();

//System.out.println(new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));

string = string.replaceFirst(searchString, replacement);

previous.setValue(string.getBytes());

} else if (op.getName().equals("TJ")) {

COSArray previous = (COSArray) tokens.get(j - 1);

for (int k = 0; k < previous.size(); k++) {

Object arrElement = previous.getObject(k);

if (arrElement instanceof COSString) {

COSString cosString = (COSString) arrElement;

String string = cosString.getString();

//System.out.println("22::"+new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));

string = StringUtils.replaceOnce(string, searchString, replacement);

cosString.setValue(string.getBytes());

}

}

}

}

}

PDStream updatedStream = new PDStream(document);

OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);

ContentStreamWriter tokenWriter = new ContentStreamWriter(out);

tokenWriter.writeTokens(tokens);

// save content

page.setContents(updatedStream);

out.close();

}

Any solution/suggestion is highly appreciated.

解决方案

This is a working version that uses PDFBox:

import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;

/**
 * Utility that replaces text in the page content streams of a PDF using Apache PDFBox.
 *
 * <p>Only strings shown by the {@code Tj} and {@code TJ} operators in a page's own content
 * stream are examined.
 *
 * <p>NOTE(review): content-stream strings are encoded with the current font's encoding, so
 * matching on {@code COSString.getString()} presumably only works for simple fonts with an
 * ASCII-compatible encoding, and the replacement must be representable in that font.
 */
public final class PDFEditor {

    private PDFEditor() {
        // Utility class; not meant to be instantiated.
    }

    /**
     * Loads the PDF at "src path", replaces "Hello" with "Hi", and saves it to "target Path".
     *
     * @param args unused
     * @throws IOException if the document cannot be loaded, modified or saved
     */
    public static void main(String[] args) throws IOException {
        // try-with-resources closes the document even if replacing or saving fails.
        try (PDDocument document = PDDocument.load(new File("src path"))) {
            replaceText(document, "Hello", "Hi");
            document.save("target Path");
        }
    }

    /**
     * Replaces {@code searchString} with {@code replacement} on every page of the document.
     *
     * <p>For a {@code Tj} operand the first literal occurrence inside the string is replaced.
     * For a {@code TJ} array the strings of the array are concatenated, and the array is
     * replaced as a whole only if that concatenation, trimmed, equals {@code searchString}.
     * Every page's content stream is rewritten from its token list afterwards.
     *
     * @param document the document to modify in place
     * @param searchString the literal text to look for; matched as plain text, not as a regex
     * @param replacement the text to put in its place
     * @return the same {@code document} instance; returned unmodified if either string is
     *     {@code null} or empty
     * @throws IOException if a content stream cannot be parsed or written
     */
    private static PDDocument replaceText(PDDocument document, String searchString, String replacement)
            throws IOException {
        if (StringUtils.isEmpty(searchString) || StringUtils.isEmpty(replacement)) {
            return document;
        }

        for (PDPage page : document.getPages()) {
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();
            List<Object> tokens = parser.getTokens();

            // Start at 1: an operator's operand is the token immediately before it.
            for (int j = 1; j < tokens.size(); j++) {
                Object token = tokens.get(j);
                if (!(token instanceof Operator)) {
                    continue;
                }
                String operatorName = ((Operator) token).getName();
                Object operand = tokens.get(j - 1);

                if ("Tj".equals(operatorName) && operand instanceof COSString) {
                    replaceInString((COSString) operand, searchString, replacement);
                } else if ("TJ".equals(operatorName) && operand instanceof COSArray) {
                    replaceWholeArray((COSArray) operand, searchString, replacement);
                }
            }

            PDStream updatedStream = new PDStream(document);
            // Close the stream before attaching it so the compressed data is fully flushed.
            try (OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE)) {
                ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
                tokenWriter.writeTokens(tokens);
            }
            page.setContents(updatedStream);
        }
        return document;
    }

    /** Replaces the first literal match inside a {@code Tj} operand, if there is one. */
    private static void replaceInString(COSString text, String searchString, String replacement) {
        String original = text.getString();
        String updated = StringUtils.replaceOnce(original, searchString, replacement);
        // Only rewrite strings that actually changed, so that unrelated strings are never
        // re-encoded (and possibly corrupted) by the String round trip.
        if (!updated.equals(original)) {
            text.setValue(encode(updated));
        }
    }

    /**
     * Collapses a {@code TJ} array to a single string holding {@code replacement} when the
     * array's concatenated strings, trimmed, equal {@code searchString}.
     */
    private static void replaceWholeArray(COSArray array, String searchString, String replacement) {
        StringBuilder shownText = new StringBuilder();
        COSString firstString = null;
        for (int k = 0; k < array.size(); k++) {
            Object element = array.getObject(k);
            if (element instanceof COSString) {
                COSString part = (COSString) element;
                if (firstString == null) {
                    firstString = part;
                }
                shownText.append(part.getString());
            }
        }

        if (firstString != null && searchString.equals(shownText.toString().trim())) {
            // Reuse the first string element rather than assuming index 0 is a string:
            // a TJ array may begin with a numeric position adjustment.
            firstString.setValue(encode(replacement));
            array.clear();
            array.add(firstString);
        }
    }

    /**
     * Encodes text as one byte per character, independent of the platform default charset.
     * NOTE(review): ISO-8859-1 is an approximation of the font encoding; confirm for the
     * fonts actually used in the target documents.
     */
    private static byte[] encode(String text) {
        return text.getBytes(StandardCharsets.ISO_8859_1);
    }
}

Dependencies :

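<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.0.6</version>
</dependency>

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.11</version>
</dependency>

<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.0</version>
</dependency>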

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值