去除PDF文件中的斜体文字水印

该博客介绍了如何正确去除PDF文件中的文字型水印,特别是那些倾斜的水印。通过使用Apache PDFBox库,创建了WatermarkScancer、WatermarkRemover和WatermarkProcessor类来检测和删除水印。方法包括检测文字倾斜度,多线程并行处理以提高效率,尤其适合处理多页PDF。代码示例展示了具体的实现过程。
摘要由CSDN通过智能技术生成

    网上(包括GitHub)已有不少PDF去除水印的文章、方法和代码,但它们针对的大多是以图片为主的水印。一般情况下PDF水印均是倾斜的文字(下称斜体水印),印于文档的底部,按照这些文章的做法根本无法去除,它们也不是正确的去除办法。这里要介绍的是一种能够正确去除此类水印、并且已经在实际运行的项目中使用的方法。

    斜体水印并不是图片,因此不能通过检测PDF中的图片来删除水印。这种水印其实本身是文字,要用清除文字的方式来清除。主要思路是检测PDF中文字的倾斜度来检测水印,然后进行清除。下面给出源代码。

WatermarkScancer.java 水印检测类,用于检测PDF中的水印,并将检测到的文字保存到缓存中。

WatermarkRemover.java 水印清除类,用于清除PDF中的水印。

WatermarkProcessor.java 水印处理器类,负责调度水印的检测与清除任务(每3页一组并行执行),并在清除完成后将结果统一回写到文档中。

本文采用并行处理,可处理多页PDF的去水印。

import java.io.OutputStream;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Vector;
import java.util.concurrent.CompletableFuture;

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDStream;

/**
 * Coordinates watermark detection and removal for a PDF document.
 *
 * <p>{@link #init(PDDocument)} runs a {@code WatermarkChecker} and then scans the pages with
 * {@code WatermarkScancer} tasks. The {@code removeWatermark} methods run {@code WatermarkRemover}
 * tasks and afterwards write the edited token lists back into the pages. Pages are split into
 * groups of {@link #PAGES_PER_TASK} consecutive pages, one asynchronous task per group.
 *
 * <p>NOTE(review): the tasks share one {@code PDDocument}; confirm that concurrent read access is
 * acceptable for the PDFBox version in use.
 */
public class WatermarkProcessor extends BaseWatermarkRemover implements IWatermarkProcessor {
	/** Number of consecutive pages handled by a single asynchronous task. */
	private static final int PAGES_PER_TASK = 3;

	/**
	 * Initializes the processor for {@code document}: first runs the watermark check, then scans
	 * all pages for watermark text. Both phases are awaited before this method returns.
	 *
	 * @param document the PDF document to analyse
	 */
	@Override
	public void init(PDDocument document) {
		super.init(document);

		// Phase 1: check whether the document contains a watermark at all.
		CompletableFuture<Void> checkerTask = CompletableFuture.runAsync(() -> {
			WatermarkChecker checker = new WatermarkChecker(WatermarkProcessor.this);
			checker.run();
		});
		checkerTask.join();

		// Phase 2: collect the watermark texts, one task per group of pages.
		int threadCount = getThreadCount();
		CompletableFuture<?>[] scancerTasks = new CompletableFuture<?>[threadCount];
		for (int i = 0; i < threadCount; i++) {
			final int pageStart = i * PAGES_PER_TASK;
			scancerTasks[i] = CompletableFuture.runAsync(() -> {
				WatermarkScancer scancer = new WatermarkScancer(WatermarkProcessor.this, pageStart, PAGES_PER_TASK);
				scancer.run();
			});
		}
		CompletableFuture.allOf(scancerTasks).join();
	}

	/**
	 * Removes the watermarks that were detected during {@link #init(PDDocument)}. Pages are
	 * processed in parallel groups and written back once all groups have finished.
	 *
	 * @throws Exception if writing an updated content stream fails
	 */
	@Override
	public void removeWatermark() throws Exception {
		removeAndRewrite(null);
	}

	/**
	 * Removes the given watermark texts in addition to the ones detected during
	 * {@link #init(PDDocument)}.
	 *
	 * @param watermarks exact text strings to treat as watermarks; may be {@code null}
	 * @throws Exception if writing an updated content stream fails
	 */
	@Override
	public void removeWatermark(List<String> watermarks) throws Exception {
		removeAndRewrite(watermarks);
	}

	/**
	 * Shared implementation of both {@code removeWatermark} overloads: runs the remover tasks,
	 * orders their results by page number and writes each page's tokens into a new content stream.
	 */
	private void removeAndRewrite(List<String> watermarks) throws Exception {
		int threadCount = getThreadCount();
		CompletableFuture<?>[] removerTasks = new CompletableFuture<?>[threadCount];
		// Vector is synchronized, so the tasks can append their results concurrently.
		final Vector<RemoveResult> removeResults = new Vector<>();

		for (int i = 0; i < threadCount; i++) {
			final int pageStart = i * PAGES_PER_TASK;
			removerTasks[i] = CompletableFuture.runAsync(() -> {
				WatermarkRemover remover = new WatermarkRemover(WatermarkProcessor.this, pageStart, PAGES_PER_TASK,
						watermarks);
				remover.removeWatermark();
				removeResults.addAll(remover.getPageTokens());
			});
		}
		CompletableFuture.allOf(removerTasks).join();

		// Tasks finish in arbitrary order; restore page order before writing back.
		Collections.sort(removeResults, Comparator.comparingInt(RemoveResult::getPageNo));

		// Write-back happens on the calling thread only, after every task has completed.
		for (RemoveResult result : removeResults) {
			PDStream updatedStream = new PDStream(document);
			// try-with-resources closes the stream even when writing the tokens fails.
			try (OutputStream out = updatedStream.createOutputStream(COSName.FLATE_DECODE)) {
				ContentStreamWriter tokenWriter = new ContentStreamWriter(out);
				tokenWriter.writeTokens(result.getTokens());
			}
			result.getPage().setContents(updatedStream);
		}
	}

	/**
	 * Returns the number of page groups, i.e. the page count divided by {@link #PAGES_PER_TASK},
	 * rounded up.
	 */
	private int getThreadCount() {
		int pageCount = document.getNumberOfPages();
		return (pageCount + PAGES_PER_TASK - 1) / PAGES_PER_TASK;
	}
}
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdmodel.PDPage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Blanks out watermark text in the content streams of a contiguous range of pages.
 *
 * <p>The parsed (and possibly edited) token list of every successfully processed page is kept as a
 * {@link RemoveResult}; this class itself does not write anything back to the document.
 */
public class WatermarkRemover {
	Logger logger = LoggerFactory.getLogger(WatermarkRemover.class);

	IWatermarkProcessor remover;
	List<RemoveResult> pageTokens = new ArrayList<>();
	List<String> watermarks = null;
	int pageStartIndex;
	int pageLength;

	/**
	 * @param remover        processor that supplies the document and the detected watermark words
	 * @param pageStartIndex zero-based index of the first page to process
	 * @param pageLength     maximum number of pages to process starting at {@code pageStartIndex}
	 * @param watermarks     additional exact texts to treat as watermarks; may be {@code null}
	 */
	public WatermarkRemover(IWatermarkProcessor remover, int pageStartIndex, int pageLength, List<String> watermarks) {
		this.remover = remover;
		this.pageStartIndex = pageStartIndex;
		this.pageLength = pageLength;
		this.watermarks = watermarks;
	}

	/**
	 * Processes every page of the configured range that exists in the document. A failure on one
	 * page is logged and does not stop the remaining pages.
	 */
	public void removeWatermark() {
		for (int i = pageStartIndex; i < pageStartIndex + pageLength; i++) {
			if (i >= remover.getDocument().getNumberOfPages()) {
				break;
			}
			try {
				processPage(i, remover.getDocument().getPage(i));
			} catch (Exception e) {
				logger.error("【解析PDF页面失败】", e);
			}
		}
	}

	/**
	 * Parses the content stream of {@code page}, empties the string operand of every {@code Tj}
	 * operator whose text is a watermark, and records the resulting tokens.
	 *
	 * @param index zero-based page number stored in the result
	 * @param page  the page to parse
	 * @throws Exception if parsing the content stream or decoding a string fails
	 */
	public void processPage(int index, PDPage page) throws Exception {
		PDFStreamParser parser = new PDFStreamParser(page);
		parser.parse();
		List<?> tokens = parser.getTokens();
		if (Objects.nonNull(tokens)) {
			// Start at 1: a Tj operator at position 0 has no operand in front of it.
			for (int j = 1; j < tokens.size(); j++) {
				Object next = tokens.get(j);
				if (!(next instanceof Operator) || !"Tj".equals(((Operator) next).getName())) {
					continue;
				}
				// Only touch well-formed "(string) Tj" pairs; previously any other operand type
				// caused a ClassCastException that discarded the whole page.
				Object operand = tokens.get(j - 1);
				if (operand instanceof COSString) {
					blankIfWatermark((COSString) operand);
				}
			}
		}

		RemoveResult pageResult = new RemoveResult(page, index, tokens);
		pageTokens.add(pageResult);
	}

	/**
	 * Replaces the value of {@code text} with an empty string when it matches one of the
	 * explicitly supplied watermarks or a watermark word known to the processor.
	 */
	private void blankIfWatermark(COSString text) throws Exception {
		String string = text.getString();
		if (Utils.isISO8859_1Charset(string)) {
			// Re-interpret the raw bytes as GBK text.
			string = new String(string.getBytes("ISO8859-1"), "GBK");
		}

		boolean listed = null != watermarks && watermarks.contains(string);
		if (listed || remover.isWatermarkWord(string)) {
			text.setValue(new byte[0]);
		}
	}

	/**
	 * @return the results of all pages processed so far, in processing order
	 */
	public List<RemoveResult> getPageTokens() {
		return pageTokens;
	}

	/** Holds a page, its page number and the token list parsed from its content stream. */
	static class RemoveResult {
		PDPage page;
		int pageNo;
		List<?> tokens;

		public RemoveResult(PDPage page, int pageNo, List<?> tokens) {
			this.page = page;
			this.pageNo = pageNo;
			this.tokens = tokens;
		}

		public PDPage getPage() {
			return page;
		}

		public void setPage(PDPage page) {
			this.page = page;
		}

		public int getPageNo() {
			return pageNo;
		}

		public void setPageNo(int pageNo) {
			this.pageNo = pageNo;
		}

		public List<?> getTokens() {
			return tokens;
		}

		public void setTokens(List<?> tokens) {
			this.tokens = tokens;
		}
	}
}
import java.io.IOException;
import java.util.List;

import org.apache.pdfbox.contentstream.PDFStreamEngine;
import org.apache.pdfbox.contentstream.operator.DrawObject;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.state.Concatenate;
import org.apache.pdfbox.contentstream.operator.state.Restore;
import org.apache.pdfbox.contentstream.operator.state.Save;
import org.apache.pdfbox.contentstream.operator.state.SetGraphicsStateParameters;
import org.apache.pdfbox.contentstream.operator.state.SetMatrix;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.util.Matrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Scans a contiguous range of pages and registers the text of every {@code Tj} operator that is
 * drawn with a skewed text line matrix as a watermark word on the processor.
 */
public class WatermarkScancer extends PDFStreamEngine {
	Logger logger = LoggerFactory.getLogger(WatermarkScancer.class);

	IWatermarkProcessor remover;
	int pageStartIndex;
	int pageLength;

	/**
	 * @param remover        processor that supplies the document and receives the watermark words
	 * @param pageStartIndex zero-based index of the first page to scan
	 * @param pageLength     maximum number of pages to scan starting at {@code pageStartIndex}
	 */
	public WatermarkScancer(IWatermarkProcessor remover, int pageStartIndex, int pageLength) {
		addOperator(new Concatenate());
		addOperator(new DrawObject());
		addOperator(new SetGraphicsStateParameters());
		addOperator(new Save());
		addOperator(new Restore());
		addOperator(new SetMatrix());
		this.remover = remover;
		this.pageStartIndex = pageStartIndex;
		this.pageLength = pageLength;
	}

	/**
	 * Scans every page of the configured range that exists in the document. A failure on one page
	 * is logged and the scan continues with the next page, so a single broken page no longer hides
	 * the watermarks of the pages after it.
	 */
	public void run() {
		for (int i = pageStartIndex; i < pageStartIndex + pageLength; i++) {
			try {
				if (i >= remover.getDocument().getNumberOfPages()) {
					break;
				}
				processPage(remover.getDocument().getPage(i));
			} catch (Exception e) {
				logger.error("【扫描页面水印出错】", e);
			}
		}
	}

	/**
	 * Handles one operator of the content stream. {@code Tj} operators are inspected for watermark
	 * text; every other operator is delegated to the engine.
	 */
	@Override
	protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
		if (!"Tj".equals(operator.getName())) {
			// Required: all other operators must still reach the engine's registered handlers.
			super.processOperator(operator, operands);
			return;
		}

		// Skip a malformed Tj (no operand, or an operand that is not a string) instead of
		// failing with an IndexOutOfBoundsException / ClassCastException.
		if (operands == null || operands.isEmpty() || !(operands.get(0) instanceof COSString)) {
			return;
		}

		String string = ((COSString) operands.get(0)).getString();
		if (Utils.isISO8859_1Charset(string)) {
			// Re-interpret the raw bytes as GBK text.
			string = new String(string.getBytes("ISO8859-1"), "GBK");
		}

		// Text counts as a skewed watermark when the text line matrix has a non-zero Y shear
		// and a Y scale that is neither 0 nor 1.
		Matrix matrix = getTextLineMatrix();
		boolean skewed = matrix != null && matrix.getScaleY() != 0 && matrix.getScaleY() != 1
				&& matrix.getShearY() != 0;
		if (skewed && !remover.isWatermarkWord(string)) {
			remover.addWatermarkWord(string);
		}
	}
}
/**
 * Demo entry point: loads a PDF, removes its watermarks when any are detected and saves the
 * cleaned document to a separate file.
 *
 * @throws Exception if the PDF cannot be loaded, processed or saved
 */
public static void main(String[] args) throws Exception {
    String pdfPath = "d:/test.pdf";
    // Written to a separate file so the original PDF is left untouched.
    String outputPath = "d:/test_no_watermark.pdf";

    // PDFBox 2.x loads from a File (there is no load(String)); try-with-resources closes the document.
    try (PDDocument document = PDDocument.load(new java.io.File(pdfPath))) {
        WatermarkProcessor processor = new WatermarkProcessor();
        processor.init(document);
        if (processor.isWatermarkPDF()) {
            // Remove the watermarks, then persist the result; without saving, the changes are lost.
            processor.removeWatermark();
            document.save(outputPath);
        }
    }
}

评论 15
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

weixin_44214515

你的鼓励是我分享的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值