Need to replace the text in the pdf with different language. In the first step, I was trying to search and replace a text in the pdf file using itextpdf ad pdfbox API.
Use the below code snippet which uses itextpdf api to search and replace the text "Hello" to "Hi" from the source PDF file. The new PDF is created without any text replacements.
public void manipulatePdf(String src, String dest) throws Exception {
PdfDocument pdfDoc = new PdfDocument(new PdfReader(SRC), new PdfWriter(DEST));
int noOfPages = pdfDoc.getNumberOfPages();
for (int i = 1; i < noOfPages; i++) {
PdfPage page = pdfDoc.getPage(i);
PdfDictionary dict = page.getPdfObject();
PdfObject object = dict.get(PdfName.Contents);
if (object instanceof PdfStream) {
PdfStream stream = (PdfStream) object;
byte[] data = stream.getBytes();
stream.setData(new String(data).replace("Hello", "Hi").getBytes("UTF-8"));
}
}
pdfDoc.close();
}
Also used apache pdfbox to achieve the same thing but no luck in that. Below is the code snippet for the reference.
public static PDDocument replaceText(PDDocument document, String searchString, String replacement)
throws IOException {
for (PDPage page : document.getPages()) {
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof Operator) {
Operator op = (Operator) next;
// Tj and TJ are the two operators that display strings in a PDF
if (op.getName().equals("Tj")) {
// Tj takes one operator and that is the string to display
// so lets update that operator
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();
//System.out.println(new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
string = string.replaceFirst(searchString, replacement);
previous.setValue(string.getBytes());
} else if (op.getName().equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j - 1);
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
//System.out.println("22::"+new String(string.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));
string = StringUtils.replaceOnce(string, searchString, replacement);
cosString.setValue(string.getBytes());
}
}
}
}
}
PDStream updatedStream = new PDStream(document);
OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
// save content
page.setContents(updatedStream);
out.close();
}
Any solution/suggestion is highly appreciated.
解决方案
This is a working version, uses PDFBox
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
public final class PDFEditor {
private PDFEditor() {
}
public static void main(String[] args) throws IOException {
PDDocument document = null;
document = PDDocument.load(new File("src path"));
document = replaceText(document, "Hello", "Hi");
document.save("target Path");
document.close();
}
private static PDDocument replaceText(PDDocument document, String searchString, String replacement) throws IOException {
if (StringUtils.isEmpty(searchString) || StringUtils.isEmpty(replacement)) {
return document;
}
for (PDPage page : document.getPages()) {
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List> tokens = parser.getTokens();
for (int j = 0; j < tokens.size(); j++) {
Object next = tokens.get(j);
if (next instanceof Operator) {
Operator op = (Operator) next;
String pstring = "";
int prej = 0;
if (op.getName().equals("Tj")) {
COSString previous = (COSString) tokens.get(j - 1);
String string = previous.getString();
string = string.replaceFirst(searchString, replacement);
previous.setValue(string.getBytes());
} else if (op.getName().equals("TJ")) {
COSArray previous = (COSArray) tokens.get(j - 1);
for (int k = 0; k < previous.size(); k++) {
Object arrElement = previous.getObject(k);
if (arrElement instanceof COSString) {
COSString cosString = (COSString) arrElement;
String string = cosString.getString();
if (j == prej) {
pstring += string;
} else {
prej = j;
pstring = string;
}
}
}
if (searchString.equals(pstring.trim())) {
COSString cosString2 = (COSString) previous.getObject(0);
cosString2.setValue(replacement.getBytes());
int total = previous.size() - 1;
for (int k = total; k > 0; k--) {
previous.remove(k);
}
}
}
}
}
PDStream updatedStream = new PDStream(document);
OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(tokens);
out.close();
page.setContents(updatedStream);
}
return document;
}
}
Dependencies :
com.itextpdf
itextpdf
5.0.6
org.apache.pdfbox
pdfbox
2.0.11
org.apache.commons
commons-lang3
3.0