基于 PDFBox 按区域解析 PDF 银行流水

本文介绍了一种用于解析银行流水PDF文件的方法,通过定位关键坐标来提取数据,并针对广发、交通及农业银行的不同格式进行了定制化的实现。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

package 坐标;

import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Base class for bank-statement PDF parsers that locate table cells by page coordinates.
 *
 * <p>A subclass implements {@link #analysisBankBillContent(PDDocument)} for one bank's layout. The
 * helpers in this class probe a page with small rectangles to discover where columns and rows
 * start, and turn a fully populated {@link PDFTextStripperByArea} into rows of cell text.
 *
 * @author natsu
 * @date 2022/3/7 9:16
 */
public abstract class AbstractBankBillAnalysis {

	/** Cell delimiter used while a row is assembled; rows are split on it before being stored. */
	public static final String SEPARATOR = "\001";

	/** Windows-style line break that is removed from extracted row text. */
	public static final String PDF_LINE_FEED = "\r\n";

	/** Side length of each square probe used when scanning a horizontal strip for column starts. */
	private static final int COLUMN_PROBE_SIZE = 5;

	/** Width of each probe used when scanning downwards for row starts. */
	private static final int ROW_PROBE_WIDTH = 20;

	/** Height of each probe used when scanning downwards for row starts. */
	private static final int ROW_PROBE_HEIGHT = 5;

	/**
	 * Loads the statement PDF, parses it and releases the document.
	 *
	 * @param filePathName path of the PDF file
	 * @param filePassWord password used to open the PDF
	 * @return one {@code String[]} per parsed row
	 * @throws Exception if the file cannot be loaded or the subclass parser fails
	 * @author natsu
	 * @date 2022/3/7 9:32
	 */
	protected List<String[]> analysisBankBill(String filePathName, String filePassWord) throws Exception {
		// The parsed rows are plain strings, so the document can be closed before they are
		// returned; leaving it open would leak the underlying file handle on every call.
		try (PDDocument pdDocument = readBankBillContent(filePathName, filePassWord)) {
			return analysisBankBillContent(pdDocument);
		}
	}

	/**
	 * Opens the statement PDF. The caller owns the returned document and must close it.
	 *
	 * @param filePathName path of the PDF file
	 * @param filePassWord password used to open the PDF
	 * @return the loaded document
	 * @throws IOException if the file cannot be read or decrypted
	 * @author natsu
	 * @date 2022/3/7 9:19
	 */
	protected PDDocument readBankBillContent(String filePathName, String filePassWord) throws IOException {
		return PDDocument.load(new File(filePathName), filePassWord);
	}

	/**
	 * Parses every page of an already opened statement.
	 *
	 * @param pdDocument the opened document
	 * @return one {@code String[]} per parsed row
	 * @throws Exception if parsing fails
	 * @author natsu
	 * @date 2022/3/11 17:48
	 */
	protected abstract List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception;

	/**
	 * Finds column x-coordinates by looking for known header fragments.
	 *
	 * <p>A horizontal strip of 5x5 probes is laid out from {@code startX} in steps of 5 for
	 * {@code pageWidth} units at height {@code startY}. The x-coordinate of every probe whose text
	 * contains at least one of {@code locators} is reported, in left-to-right order.
	 *
	 * @param page page to scan
	 * @param startY y-coordinate of the probe strip
	 * @param column expected number of columns; only used to size the result list
	 * @param startX x-coordinate of the first probe
	 * @param locators text fragments that mark the start of a column
	 * @param pageWidth horizontal distance to scan, starting at {@code startX}
	 * @return x-coordinates of the probes that matched a locator
	 * @throws IOException if text extraction fails
	 * @author natsu
	 * @date 2022/3/11 17:48
	 */
	public List<Integer> reckonColumnSpace(PDPage page, int startY, int column, int startX, String[] locators,
			int pageWidth) throws IOException {
		List<Integer> columnSpaces = new ArrayList<>(Math.max(column, 0));
		PDFTextStripperByArea stripper = extractColumnProbes(page, startY, startX, pageWidth);
		for (String regionName : stripper.getRegions()) {
			String text = stripper.getTextForRegion(regionName);
			if (StringUtils.isBlank(text)) {
				continue;
			}
			if (containsAny(text, locators)) {
				// Region names are the probe's x-coordinate, see extractColumnProbes.
				columnSpaces.add(Integer.valueOf(regionName));
			}
		}
		return columnSpaces;
	}

	/**
	 * Finds column x-coordinates by looking for gaps between runs of text.
	 *
	 * <p>The same probe strip as the locator-based overload is used. The first non-blank probe
	 * starts a column; another column starts at the first non-blank probe that follows a run of
	 * more than {@code spaceSum} consecutive blank probes.
	 *
	 * @param page page to scan
	 * @param startY y-coordinate of the probe strip
	 * @param column expected number of columns; only used to size the result list
	 * @param startX x-coordinate of the first probe
	 * @param pageWidth horizontal distance to scan, starting at {@code startX}
	 * @param spaceSum number of consecutive blank probes that is still treated as part of a column
	 * @return x-coordinates of the probes at which a column starts
	 * @throws IOException if text extraction fails
	 * @author natsu
	 * @date 2022/3/11 17:48
	 */
	public List<Integer> reckonColumnSpace(PDPage page, int startY, int column, int startX, int pageWidth, int spaceSum)
			throws IOException {
		List<Integer> columnSpaces = new ArrayList<>(Math.max(column, 0));
		PDFTextStripperByArea stripper = extractColumnProbes(page, startY, startX, pageWidth);
		boolean awaitingColumnStart = true;
		int blankRun = 0;
		for (String regionName : stripper.getRegions()) {
			String text = stripper.getTextForRegion(regionName);
			if (StringUtils.isBlank(text)) {
				blankRun++;
				if (blankRun > spaceSum) {
					awaitingColumnStart = true;
				}
				continue;
			}
			if (awaitingColumnStart) {
				columnSpaces.add(Integer.valueOf(regionName));
				awaitingColumnStart = false;
			}
			blankRun = 0;
		}
		return columnSpaces;
	}

	/**
	 * Finds row y-coordinates by scanning downwards along one column.
	 *
	 * <p>A 20x5 probe is placed at {@code startX} for every integer y from {@code startY} while the
	 * offset is below the page's art-box height minus {@code removalHeight}. The y-coordinate of
	 * each non-blank probe is reported until a probe contains one of {@code excludeCharacters};
	 * that probe and everything below it are ignored.
	 *
	 * @param page page to scan
	 * @param startY y-coordinate of the first probe
	 * @param rowNum expected number of rows; only used to size the result list
	 * @param startX x-coordinate of the probes
	 * @param removalHeight height subtracted from the art-box height to shorten the scan
	 * @param excludeCharacters text fragments that mark the end of the table
	 * @return y-coordinates of the non-blank probes above the end marker
	 * @throws IOException if text extraction fails
	 * @author natsu
	 * @date 2022/3/11 17:48
	 */
	public List<Integer> reckonRowSpace(PDPage page, int startY, int rowNum, int startX, int removalHeight,
			String[] excludeCharacters) throws IOException {
		List<Integer> rowSpaces = new ArrayList<>(Math.max(rowNum, 0));
		PDFTextStripperByArea stripper = new PDFTextStripperByArea();
		// Loop-invariant: the page box does not change while probes are being registered.
		float scanHeight = page.getArtBox().getHeight() - removalHeight;
		for (int offset = 0; offset < scanHeight; offset++) {
			int y = startY + offset;
			stripper.addRegion(String.valueOf(y), new Rectangle(startX, y, ROW_PROBE_WIDTH, ROW_PROBE_HEIGHT));
		}
		stripper.setSortByPosition(true);
		stripper.extractRegions(page);
		for (String regionName : stripper.getRegions()) {
			String text = stripper.getTextForRegion(regionName);
			if (StringUtils.isBlank(text)) {
				continue;
			}
			if (containsAny(text, excludeCharacters)) {
				break;
			}
			rowSpaces.add(Integer.valueOf(regionName));
		}
		return rowSpaces;
	}

	/**
	 * Extracts all regions registered on {@code stripper} and appends them to {@code results} as rows.
	 *
	 * <p>Regions are consumed in registration order, {@code column} regions per row. A trailing
	 * partial row is emitted as well. Each row is printed to standard output and then split on
	 * {@link #SEPARATOR}; because {@link String#split(String)} discards trailing empty strings, a
	 * row whose last cells are empty yields a shorter array.
	 *
	 * @param stripper stripper with one region per table cell already registered
	 * @param page page to extract from
	 * @param column number of cells per row; must be positive when regions are registered
	 * @param results list that receives one {@code String[]} per row
	 * @throws IOException if text extraction fails
	 * @throws IllegalArgumentException if regions are registered but {@code column} is not positive
	 */
	public void formatData(PDFTextStripperByArea stripper, PDPage page, int column, List<String[]> results)
			throws IOException {
		stripper.setSortByPosition(true);
		stripper.extractRegions(page);
		List<String> regions = stripper.getRegions();
		if (regions.isEmpty()) {
			return;
		}
		if (column <= 0) {
			throw new IllegalArgumentException("column must be positive but was " + column);
		}
		// Walking in fixed-size chunks guarantees the final row is always emitted, including
		// when it holds a single cell (the index-modulo approach lost that row).
		for (int rowStart = 0; rowStart < regions.size(); rowStart += column) {
			int rowEnd = Math.min(rowStart + column, regions.size());
			StringBuilder data = new StringBuilder();
			for (String regionName : regions.subList(rowStart, rowEnd)) {
				data.append(stripper.getTextForRegion(regionName)).append(SEPARATOR);
			}
			String row = stripLineBreaks(data.toString());
			// NOTE(review): this prints every statement row to stdout; consider a logger.
			System.out.println(row);
			results.add(row.split(SEPARATOR));
		}
	}

	/** Registers the horizontal strip of square probes, named by their x-coordinate, and extracts it. */
	private PDFTextStripperByArea extractColumnProbes(PDPage page, int startY, int startX, int pageWidth)
			throws IOException {
		PDFTextStripperByArea stripper = new PDFTextStripperByArea();
		for (int offset = 0; offset < pageWidth; offset += COLUMN_PROBE_SIZE) {
			int x = offset + startX;
			stripper.addRegion(String.valueOf(x), new Rectangle(x, startY, COLUMN_PROBE_SIZE, COLUMN_PROBE_SIZE));
		}
		stripper.setSortByPosition(true);
		stripper.extractRegions(page);
		return stripper;
	}

	/** Returns whether {@code text} contains at least one of {@code fragments}. */
	private static boolean containsAny(String text, String[] fragments) {
		for (String fragment : fragments) {
			if (StringUtils.contains(text, fragment)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Removes line breaks from row text: {@link #PDF_LINE_FEED} and the platform line separator,
	 * which is presumably what PDFBox emits by default on non-Windows systems.
	 */
	private static String stripLineBreaks(String text) {
		return text.replace(PDF_LINE_FEED, "").replace(System.lineSeparator(), "");
	}

}

 广发:

package 坐标;

import com.yunsax.common.fileutils.FileUtils;
import com.yunsax.common.json.utils.GsonUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import java.awt.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Parser for statement PDFs of Guangfa bank (China Guangfa Bank).
 *
 * @author natsu
 * @date 2022/3/7 9:27
 */
public class GuangFaBankBillAnalysis extends AbstractBankBillAnalysis {

	/**
	 * Parses every page: detects column starts from the strip at y=130, shifts the first seven
	 * column starts to the left, detects row starts from y=140 downwards, then extracts one
	 * region per cell.
	 *
	 * @param pdDocument the opened statement
	 * @return all rows of all pages, in page order
	 * @throws Exception if extraction fails or a page yields fewer than seven columns
	 */
	@Override
	protected List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception {
		PDPageTree pages = pdDocument.getDocumentCatalog().getPages();
		final int totalPages = pages.getCount();
		final int rowsPerPage = 23;
		final int pageWidth = 840;
		final int headerProbeY = 130;
		final int rowProbeY = 140;
		// Column k (1..6) is moved left by 1/divisor of its distance to the already shifted
		// column k-1; the first column is handled separately below.
		final int[] shiftDivisors = { 4, 4, 4, 3, 6, 5 };

		// Starts as the expected column count and is replaced by the detected count of each page.
		int columnCount = 9;
		List<String[]> results = new ArrayList<>(totalPages * rowsPerPage);

		for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
			PDPage page = pages.get(pageIndex);

			List<Integer> xs = reckonColumnSpace(page, headerProbeY, columnCount, 0, pageWidth, 2);
			columnCount = xs.size();

			// The first column is moved left by a third of its own x-coordinate.
			xs.set(0, xs.get(0) - xs.get(0) / 3);

			// Rows are probed along the shifted first column; "期" marks the end of the table.
			List<Integer> ys = reckonRowSpace(page, rowProbeY, rowsPerPage, xs.get(0), 200, new String[] { "期" });

			for (int k = 1; k <= shiftDivisors.length; k++) {
				int gapToPrevious = xs.get(k) - xs.get(k - 1);
				xs.set(k, xs.get(k) - gapToPrevious / shiftDivisors[k - 1]);
			}

			PDFTextStripperByArea stripper = new PDFTextStripperByArea();
			int regionNo = 1;
			final int lastRow = ys.size() - 1;
			final int lastColumn = xs.size() - 1;
			for (int r = 0; r <= lastRow; r++) {
				int top = ys.get(r);
				// The last row has no successor to measure against, so a fixed height is used.
				int height = (r == lastRow) ? 28 : ys.get(r + 1) - top;
				for (int c = 0; c <= lastColumn; c++) {
					int left = xs.get(c);
					// The last column extends to the configured page width.
					int width = (c == lastColumn) ? pageWidth - left : xs.get(c + 1) - left;
					stripper.addRegion("regionName" + regionNo, new Rectangle(left, top - (height / 3), width, height));
					regionNo++;
				}
			}

			formatData(stripper, page, columnCount, results);
			System.out.println("总数:" + results.size());
		}
		return results;
	}

	/**
	 * Manual driver: parses a fixed local file and writes the rows next to it as a .txt file.
	 *
	 * @param args unused
	 * @throws Exception if parsing or writing fails
	 */
	public static void main(String[] args) throws Exception {
		AbstractBankBillAnalysis analysis = new GuangFaBankBillAnalysis();

		long beginTime = System.currentTimeMillis();
		String filePathName = "D:\\银行流水\\2.广发银行.pdf";
		String filePassWord = "密码";
		List<String[]> results = analysis.analysisBankBill(filePathName, filePassWord);
		System.out.println("耗时:" + (System.currentTimeMillis() - beginTime));

		String outputPath = filePathName.concat(".txt");
		FileUtils.deleteFile(outputPath);
		FileUtils.writeFile(outputPath, results, true, SEPARATOR);
	}

}

交通:

package 坐标;

import com.yunsax.common.fileutils.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import java.awt.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Parser for statement PDFs of Jiaotong bank (Bank of Communications).
 *
 * @author natsu
 * @date 2022/3/7 9:27
 */
public class JiaoTongBankBillAnalysis extends AbstractBankBillAnalysis {

	/**
	 * Parses every page: detects column starts from the strip at y=210 (starting at x=40),
	 * detects row starts from y=220 downwards, then extracts one region per cell.
	 *
	 * @param pdDocument the opened statement
	 * @return all rows of all pages, in page order
	 * @throws Exception if extraction fails
	 */
	@Override
	protected List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception {
		PDPageTree pages = pdDocument.getDocumentCatalog().getPages();
		final int totalPages = pages.getCount();
		final int rowsPerPage = 33;
		final int headerProbeY = 210;
		final int rowProbeY = 220;

		// Replaced by the detected column count of each page.
		int columnCount = 0;
		List<String[]> results = new ArrayList<>(totalPages * rowsPerPage);

		for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
			PDPage page = pages.get(pageIndex);

			List<Integer> xs = reckonColumnSpace(page, headerProbeY, columnCount, 40, 600, 4);
			columnCount = xs.size();

			// "方发" marks the end of the table.
			List<Integer> ys = reckonRowSpace(page, rowProbeY, rowsPerPage, 40, 0, new String[]{"方发"});

			PDFTextStripperByArea stripper = new PDFTextStripperByArea();
			int regionNo = 1;
			final int lastRow = ys.size() - 1;
			final int lastColumn = xs.size() - 1;
			for (int r = 0; r <= lastRow; r++) {
				int top = ys.get(r);
				// The last row has no successor to measure against, so a fixed height is used.
				int height = (r == lastRow) ? 15 : ys.get(r + 1) - top;
				for (int c = 0; c <= lastColumn; c++) {
					int left = xs.get(c);
					int width;
					if (c == lastColumn) {
						// The last column extends to the width of the page's bleed box.
						width = (int) page.getBleedBox().getWidth() - left;
					} else {
						width = xs.get(c + 1) - left;
					}
					stripper.addRegion("regionName" + regionNo, new Rectangle(left, top - (height / 3), width, height));
					regionNo++;
				}
			}

			formatData(stripper, page, columnCount, results);
			System.out.println("总数:" + results.size());
		}
		return results;
	}

	/**
	 * Manual driver: parses a fixed local file and writes the rows next to it as a .txt file.
	 *
	 * @param args unused
	 * @throws Exception if parsing or writing fails
	 */
	public static void main(String[] args) throws Exception {
		AbstractBankBillAnalysis analysis = new JiaoTongBankBillAnalysis();

		long beginTime = System.currentTimeMillis();
		String filePathName = "D:\\银行流水\\交通银行.pdf";
		String filePassWord = "";
		List<String[]> results = analysis.analysisBankBill(filePathName, filePassWord);
		System.out.println("耗时:" + (System.currentTimeMillis() - beginTime));

		String outputPath = filePathName.concat(".txt");
		FileUtils.deleteFile(outputPath);
		FileUtils.writeFile(outputPath, results, true, SEPARATOR);
	}

}
农业:
package 坐标;

import com.yunsax.common.fileutils.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import java.awt.*;
import java.util.ArrayList;
import java.util.List;

/**
 * Parser for statement PDFs of Nongye bank (Agricultural Bank of China).
 *
 * @author natsu
 * @date 2022/3/7 9:27
 */
public class NongYeBankBillAnalysis extends AbstractBankBillAnalysis {

	/**
	 * Parses every page: detects column starts from a horizontal strip, detects row starts by
	 * scanning down the first column, then extracts one region per cell. The first page is probed
	 * at y=120/125, every later page at y=35/40.
	 *
	 * @param pdDocument the opened statement
	 * @return all rows of all pages, in page order
	 * @throws Exception if extraction fails or a page yields no columns
	 */
	@Override
	protected List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception {
		PDPageTree pages = pdDocument.getDocumentCatalog().getPages();
		final int totalPages = pages.getCount();
		final int rowsPerPage = 19;
		final int pageWidth = 840;

		// Starts as the expected column count and is replaced by the detected count of each page.
		int columnCount = 9;
		List<String[]> results = new ArrayList<>(totalPages * rowsPerPage);

		for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
			// Only the first page uses the lower probe positions.
			final boolean firstPage = pageIndex == 0;
			final int headerProbeY = firstPage ? 120 : 35;
			final int rowProbeY = firstPage ? 125 : 40;

			PDPage page = pages.get(pageIndex);

			List<Integer> xs = reckonColumnSpace(page, headerProbeY, columnCount, 0, pageWidth, 1);
			columnCount = xs.size();

			// Rows are probed along the first column; "该" marks the end of the table.
			List<Integer> ys = reckonRowSpace(page, rowProbeY, rowsPerPage, xs.get(0), 0, new String[] { "该" });

			PDFTextStripperByArea stripper = new PDFTextStripperByArea();
			int regionNo = 1;
			final int lastRow = ys.size() - 1;
			final int lastColumn = xs.size() - 1;
			for (int r = 0; r <= lastRow; r++) {
				int top = ys.get(r);
				// The last row has no successor to measure against, so a fixed height is used.
				int height = (r == lastRow) ? 20 : ys.get(r + 1) - top;
				for (int c = 0; c <= lastColumn; c++) {
					int left = xs.get(c);
					// The last column extends to the configured page width.
					int width = (c == lastColumn) ? pageWidth - left : xs.get(c + 1) - left;
					// Each cell starts a quarter row above the detected row and is a fifth taller.
					Rectangle cell = new Rectangle(left, top - (height / 4), width, height + (height / 5));
					stripper.addRegion("regionName" + regionNo, cell);
					regionNo++;
				}
			}

			formatData(stripper, page, columnCount, results);
			System.out.println("总数:" + results.size());
		}
		return results;
	}

	/**
	 * Manual driver: parses a fixed local file and writes the rows next to it as a .txt file.
	 *
	 * @param args unused
	 * @throws Exception if parsing or writing fails
	 */
	public static void main(String[] args) throws Exception {
		AbstractBankBillAnalysis analysis = new NongYeBankBillAnalysis();

		long beginTime = System.currentTimeMillis();
		String filePathName = "D:\\银行流水\\农业银行.pdf";
		String filePassWord = "";
		List<String[]> results = analysis.analysisBankBill(filePathName, filePassWord);
		System.out.println("耗时:" + (System.currentTimeMillis() - beginTime));

		String outputPath = filePathName.concat(".txt");
		FileUtils.deleteFile(outputPath);
		FileUtils.writeFile(outputPath, results, true, SEPARATOR);
	}

}

目前主要贴出这几个银行的解析代码,各银行的解析思路其实区别不大。

评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值