网上(包括GitHub)已有不少PDF去除水印的文章、方法和代码,但是这些大多只能去除以图片为主的水印。一般情况下PDF水印均是斜体文字,印于文档的底部,按照GitHub或网上文章的做法根本无法去除,因此那些做法并不是正确的去除办法。这里要介绍的是一个能够正确去除水印、并且已经在实际运行的项目中使用的方法。
斜体水印并不是图片,因此不能通过检测PDF中的图片来删除水印。这种水印本身其实是文字,要用清除文字的方式来清除。主要思路是通过检测PDF中文字的倾斜度来识别水印,然后将其清除。下面给出源代码。
WatermarkScancer.java 水印检测类,用于检测PDF中的水印,并将检测到的文字保存到缓存中。
WatermarkRemover.java 水印清除类,用于清除PDF中的水印。
WatermarkProcessor.java 水印处理器类,用于调度并执行水印的检测任务和清除任务。
本文的实现采用并行处理(每3页划分为一个任务),可对多页PDF进行去水印。
import java.io.OutputStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Vector;
import java.util.concurrent.CompletableFuture;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDStream;
public class WatermarkProcessor extends BaseWatermarkRemover implements IWatermarkProcessor {
@Override
public void init(PDDocument document) {
super.init(document);
// 扫描PDF文档,检查是否包含水印
CompletableFuture<Void> checkerTask = CompletableFuture.runAsync(() -> {
WatermarkChecker checker = new WatermarkChecker(WatermarkProcessor.this);
checker.run();
});
CompletableFuture.allOf(checkerTask).join();
// 扫描PDF文档,获取所有水印,如果超过3页,则启动多线程并行扫描
int threadCount = getThreadCount();
CompletableFuture<?>[] scancerTasks = new CompletableFuture<?>[threadCount];
for (int i = 0; i < threadCount; i++) {
final int pageStart = i * 3;
scancerTasks[i] = CompletableFuture.runAsync(() -> {
WatermarkScancer scancer = new WatermarkScancer(WatermarkProcessor.this, pageStart, 3);
scancer.run();
});
}
CompletableFuture.allOf(scancerTasks).join();
}
/**
* 清除水印的实现 当超过3页时,本方法采用多线程执行,并行清除页面水印,以提高效率。
*/
@Override
public void removeWatermark() throws Exception {
int threadCount = getThreadCount();
CompletableFuture<?>[] removerTasks = new CompletableFuture<?>[threadCount];
final Vector<RemoveResult> removeResults = new Vector<>();
for (int i = 0; i < threadCount; i++) {
final int pageStart = i * 3;
removerTasks[i] = CompletableFuture.runAsync(() -> {
WatermarkRemover remover = new WatermarkRemover(WatermarkProcessor.this, pageStart, 3, null);
remover.removeWatermark();
removeResults.addAll(remover.getPageTokens());
});
}
CompletableFuture.allOf(removerTasks).join();
// 对所有结果进行排序
Collections.sort(removeResults, new Comparator<RemoveResult>() {
@Override
public int compare(RemoveResult o1, RemoveResult o2) {
return o1.getPageNo() - o2.getPageNo();
}
});
// 执行完毕后统一进行回写处理
for (RemoveResult result : removeResults) {
PDStream updatedStream = new PDStream(document);
OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(result.getTokens());
out.close();
result.getPage().setContents(updatedStream);
}
}
@Override
public void removeWatermark(List<String> watermarks) throws Exception {
int threadCount = getThreadCount();
CompletableFuture<?>[] removerTasks = new CompletableFuture<?>[threadCount];
final Vector<RemoveResult> removeResults = new Vector<>();
for (int i = 0; i < threadCount; i++) {
final int pageStart = i * 3;
removerTasks[i] = CompletableFuture.runAsync(() -> {
WatermarkRemover remover = new WatermarkRemover(WatermarkProcessor.this, pageStart, 3, watermarks);
remover.removeWatermark();
removeResults.addAll(remover.getPageTokens());
});
}
CompletableFuture.allOf(removerTasks).join();
// 对所有结果进行排序
Collections.sort(removeResults, new Comparator<RemoveResult>() {
@Override
public int compare(RemoveResult o1, RemoveResult o2) {
return o1.getPageNo() - o2.getPageNo();
}
});
// 执行完毕后统一进行回写处理
for (RemoveResult result : removeResults) {
PDStream updatedStream = new PDStream(document);
OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE);
ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
tokenWriter.writeTokens(result.getTokens());
out.close();
result.getPage().setContents(updatedStream);
}
}
private int getThreadCount() {
return new Double(Math.ceil(document.getNumberOfPages() / 3d)).intValue();
}
}
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Clears watermark text from a contiguous range of pages.
 *
 * <p>Each page's content stream is parsed into tokens; the string operand of every
 * {@code Tj} operator that matches a watermark is replaced with an empty string. The
 * edited token lists are kept in memory (see {@link #getPageTokens()}) so the caller
 * can write them back to the document.
 */
public class WatermarkRemover {
    Logger logger = LoggerFactory.getLogger(WatermarkRemover.class);
    /** Owner that supplies the document and the set of detected watermark words. */
    IWatermarkProcessor remover;
    /** Edited token lists, one entry per successfully processed page. */
    List<RemoveResult> pageTokens = new ArrayList<>();
    /** Optional extra watermark strings supplied by the caller; may be {@code null}. */
    List<String> watermarks = null;
    /** Zero-based index of the first page to process. */
    int pageStartIndex;
    /** Maximum number of pages to process starting at {@link #pageStartIndex}. */
    int pageLength;

    /**
     * @param remover        owner providing the document and the detected watermark words
     * @param pageStartIndex zero-based index of the first page to process
     * @param pageLength     maximum number of pages to process
     * @param watermarks     extra strings to treat as watermarks, or {@code null}
     */
    public WatermarkRemover(IWatermarkProcessor remover, int pageStartIndex, int pageLength, List<String> watermarks) {
        this.remover = remover;
        this.pageStartIndex = pageStartIndex;
        this.pageLength = pageLength;
        this.watermarks = watermarks;
    }

    /**
     * Processes every page of the configured range that exists in the document.
     * A failure on one page is logged and does not stop the remaining pages.
     */
    public void removeWatermark() {
        int pageCount = remover.getDocument().getNumberOfPages();
        int end = Math.min(pageStartIndex + pageLength, pageCount);
        for (int i = pageStartIndex; i < end; i++) {
            try {
                processPage(i, remover.getDocument().getPage(i));
            } catch (Exception e) {
                logger.error("【解析PDF页面失败】", e);
            }
        }
    }

    /**
     * Parses one page, blanks the operand of every watermark {@code Tj} operator and
     * records the resulting token list.
     *
     * @param index zero-based page index, stored with the result
     * @param page  the page to process
     * @throws Exception if the content stream cannot be parsed or a string cannot be decoded
     */
    public void processPage(int index, PDPage page) throws Exception {
        PDFStreamParser parser = new PDFStreamParser(page);
        parser.parse();
        List<?> tokens = parser.getTokens();
        if (Objects.nonNull(tokens)) {
            // Start at 1: a Tj operator needs a preceding operand token.
            for (int j = 1; j < tokens.size(); j++) {
                Object token = tokens.get(j);
                if (!(token instanceof Operator) || !"Tj".equals(((Operator) token).getName())) {
                    continue;
                }
                Object operand = tokens.get(j - 1);
                if (!(operand instanceof COSString)) {
                    // Malformed stream: skip this operator instead of failing the whole page.
                    continue;
                }
                COSString text = (COSString) operand;
                if (isWatermark(decode(text.getString()))) {
                    text.setValue(new byte[0]);
                }
            }
        }
        pageTokens.add(new RemoveResult(page, index, tokens));
    }

    /** Re-decodes the string as GBK when {@code Utils} reports it as ISO-8859-1 text. */
    private String decode(String raw) throws java.io.UnsupportedEncodingException {
        if (Utils.isISO8859_1Charset(raw)) {
            return new String(raw.getBytes("ISO8859-1"), "GBK");
        }
        return raw;
    }

    /** Returns whether the text is a caller-supplied watermark or a detected watermark word. */
    private boolean isWatermark(String text) {
        return (null != watermarks && watermarks.contains(text)) || remover.isWatermarkWord(text);
    }

    /**
     * @return the edited token lists collected so far (the internal list, not a copy)
     */
    public List<RemoveResult> getPageTokens() {
        return pageTokens;
    }

    /** Holder for one processed page: the page, its index and its edited tokens. */
    static class RemoveResult {
        PDPage page;
        int pageNo;
        List<?> tokens;

        public RemoveResult(PDPage page, int pageNo, List<?> tokens) {
            this.page = page;
            this.pageNo = pageNo;
            this.tokens = tokens;
        }

        public PDPage getPage() {
            return page;
        }

        public void setPage(PDPage page) {
            this.page = page;
        }

        public int getPageNo() {
            return pageNo;
        }

        public void setPageNo(int pageNo) {
            this.pageNo = pageNo;
        }

        public List<?> getTokens() {
            return tokens;
        }

        public void setTokens(List<?> tokens) {
            this.tokens = tokens;
        }
    }
}
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.util.Matrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Scans a contiguous range of pages for watermark text.
 *
 * <p>The content stream of each page is replayed through {@link PDFStreamEngine}.
 * For every {@code Tj} operator the current text line matrix is inspected; text that
 * passes the scale/shear test in {@link #processOperator} is reported to the owning
 * {@code IWatermarkProcessor} as a watermark word.
 */
public class WatermarkScancer extends PDFStreamEngine {
    Logger logger = LoggerFactory.getLogger(WatermarkScancer.class);
    /** Owner that supplies the document and stores the detected watermark words. */
    IWatermarkProcessor remover;
    /** Zero-based index of the first page to scan. */
    int pageStartIndex;
    /** Maximum number of pages to scan starting at {@link #pageStartIndex}. */
    int pageLength;

    /**
     * @param remover        owner providing the document and receiving detected words
     * @param pageStartIndex zero-based index of the first page to scan
     * @param pageLength     maximum number of pages to scan
     */
    public WatermarkScancer(IWatermarkProcessor remover, int pageStartIndex, int pageLength) {
        // Operators the engine must interpret itself while the stream is replayed.
        addOperator(new Concatenate());
        addOperator(new DrawObject());
        addOperator(new SetGraphicsStateParameters());
        addOperator(new Save());
        addOperator(new Restore());
        addOperator(new SetMatrix());
        this.remover = remover;
        this.pageStartIndex = pageStartIndex;
        this.pageLength = pageLength;
    }

    /**
     * Scans every page of the configured range that exists in the document.
     * A failure on one page is logged and the scan continues with the next page.
     */
    public void run() {
        int pageCount = remover.getDocument().getNumberOfPages();
        int end = Math.min(pageStartIndex + pageLength, pageCount);
        for (int i = pageStartIndex; i < end; i++) {
            try {
                processPage(remover.getDocument().getPage(i));
            } catch (Exception e) {
                logger.error("【扫描页面水印出错】", e);
            }
        }
    }

    /**
     * Handles each operator read from the content stream. {@code Tj} operators are
     * checked for watermark text; every other operator is delegated to the engine.
     *
     * @param operator the operator being processed
     * @param operands the operands of the operator
     * @throws IOException if the engine fails to process the operator or decoding fails
     */
    @Override
    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
        if (!"Tj".equals(operator.getName())) {
            // Required: all other operators must still be processed by the engine.
            super.processOperator(operator, operands);
            return;
        }
        if (operands.isEmpty() || !(operands.get(0) instanceof COSString)) {
            // Malformed Tj without a string operand: nothing to inspect.
            return;
        }
        String string = ((COSString) operands.get(0)).getString();
        if (Utils.isISO8859_1Charset(string)) {
            string = new String(string.getBytes("ISO8859-1"), "GBK");
        }
        // Check whether the text is drawn as a tilted watermark.
        Matrix matrix = getTextLineMatrix();
        boolean tilted = matrix != null
                && matrix.getScaleY() != 0
                && matrix.getScaleY() != 1
                && matrix.getShearY() != 0;
        if (tilted && !remover.isWatermarkWord(string)) {
            remover.addWatermarkWord(string);
        }
    }
}
/**
 * Usage example: loads a PDF, removes its watermark if one is detected and saves the
 * cleaned document to a separate file.
 *
 * @param args unused
 * @throws Exception if the PDF cannot be loaded, processed or saved
 */
public static void main(String[] args) throws Exception {
    String pdfPath = "d:/test.pdf";
    // Save to a separate file so the original PDF is left untouched.
    String outputPath = "d:/test_no_watermark.pdf";
    // try-with-resources closes the document even when processing fails.
    try (PDDocument document = PDDocument.load(new java.io.File(pdfPath))) {
        WatermarkProcessor processor = new WatermarkProcessor();
        processor.init(document);
        if (processor.isWatermarkPDF()) {
            // Remove the watermark, then persist the result.
            processor.removeWatermark();
            document.save(outputPath);
        }
    }
}