package 坐标;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Abstract base class for bank-statement PDF parsers.
 *
 * <p>It loads the PDF, hands the document to a bank-specific
 * {@link #analysisBankBillContent(PDDocument)} implementation, and offers helpers that probe a
 * page with small rectangular regions to discover where columns and rows start.
 *
 * @author natsu
 * @date 2022/3/7 9:16
 */
public abstract class AbstractBankBillAnalysis {

    /** Separator appended after every cell while a row is being assembled. */
    public static final String SEPARATOR = "\001";

    /** Windows-style line feed that is removed from extracted cell text. */
    public static final String PDF_LINE_FEED = "\r\n";

    /** Edge length of each square column probe, which is also the horizontal scan step. */
    private static final int COLUMN_PROBE_SIZE = 5;

    /** Width of each row probe rectangle. */
    private static final int ROW_PROBE_WIDTH = 20;

    /** Height of each row probe rectangle; probes are placed one unit apart, so they overlap. */
    private static final int ROW_PROBE_HEIGHT = 5;

    /**
     * Parses a bank statement file into rows of cell values.
     *
     * <p>The document is closed before this method returns, whether parsing succeeds or fails.
     *
     * @param filePathName path of the PDF file
     * @param filePassWord password used to open the PDF
     * @return one {@code String[]} per extracted row
     * @throws Exception if the file cannot be loaded or the subclass fails to parse it
     * @author natsu
     * @date 2022/3/7 9:32
     */
    protected List<String[]> analysisBankBill(String filePathName, String filePassWord) throws Exception {
        // The result holds plain strings only, so the document can be released as soon as parsing ends.
        try (PDDocument pdDocument = readBankBillContent(filePathName, filePassWord)) {
            return analysisBankBillContent(pdDocument);
        }
    }

    /**
     * Loads the bank statement PDF.
     *
     * @param filePathName path of the PDF file
     * @param filePassWord password used to open the PDF
     * @return the loaded document; the caller is responsible for closing it
     * @throws IOException if the file cannot be read
     * @author natsu
     * @date 2022/3/7 9:19
     */
    protected PDDocument readBankBillContent(String filePathName, String filePassWord) throws IOException {
        return PDDocument.load(new File(filePathName), filePassWord);
    }

    /**
     * Extracts the statement rows from a loaded document.
     *
     * @param pdDocument the loaded PDF document
     * @return one {@code String[]} per extracted row
     * @throws Exception if extraction fails
     * @author natsu
     * @date 2022/3/11 17:48
     */
    protected abstract List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception;

    /**
     * Finds column X coordinates on one page by looking for known marker texts.
     *
     * <p>A horizontal strip starting at {@code startX} is covered with square probes; the X
     * coordinate of every probe whose text contains at least one locator is returned.
     *
     * @param page      page to probe
     * @param startY    Y coordinate of the probe strip
     * @param column    expected number of columns (only used as a capacity hint)
     * @param startX    X coordinate of the first probe
     * @param locators  marker texts identifying a column; must not be {@code null}
     * @param pageWidth horizontal distance to scan, starting from {@code startX}
     * @return X coordinates of the matching probes, in probe order
     * @throws IOException if text extraction fails
     * @author natsu
     * @date 2022/3/11 17:48
     */
    public List<Integer> reckonColumnSpace(PDPage page, int startY, int column, int startX, String[] locators,
            int pageWidth) throws IOException {
        List<Integer> columnSpaces = new ArrayList<>(Math.max(column, 0));
        PDFTextStripperByArea stripper = probeColumnStrip(page, startY, startX, pageWidth);
        for (String regionName : stripper.getRegions()) {
            String text = stripper.getTextForRegion(regionName);
            if (StringUtils.isBlank(text)) {
                continue;
            }
            if (containsAnyOf(text, locators)) {
                // Probe regions are named after their X coordinate.
                columnSpaces.add(Integer.valueOf(regionName));
            }
        }
        return columnSpaces;
    }

    /**
     * Finds column X coordinates on one page by detecting gaps between text runs.
     *
     * <p>The first non-blank probe starts a column. A later non-blank probe starts a new column
     * only after more than {@code spaceSum} consecutive blank probes have been seen.
     *
     * @param page      page to probe
     * @param startY    Y coordinate of the probe strip
     * @param column    expected number of columns (only used as a capacity hint)
     * @param startX    X coordinate of the first probe
     * @param pageWidth horizontal distance to scan, starting from {@code startX}
     * @param spaceSum  number of consecutive blank probes that must be exceeded to separate columns
     * @return X coordinates of the probes at which a column starts, in probe order
     * @throws IOException if text extraction fails
     * @author natsu
     * @date 2022/3/11 17:48
     */
    public List<Integer> reckonColumnSpace(PDPage page, int startY, int column, int startX, int pageWidth, int spaceSum)
            throws IOException {
        List<Integer> columnSpaces = new ArrayList<>(Math.max(column, 0));
        PDFTextStripperByArea stripper = probeColumnStrip(page, startY, startX, pageWidth);
        boolean awaitingColumnStart = true;
        int blankRun = 0;
        for (String regionName : stripper.getRegions()) {
            if (StringUtils.isBlank(stripper.getTextForRegion(regionName))) {
                blankRun++;
                if (blankRun > spaceSum) {
                    awaitingColumnStart = true;
                }
                continue;
            }
            if (awaitingColumnStart) {
                columnSpaces.add(Integer.valueOf(regionName));
                awaitingColumnStart = false;
            }
            blankRun = 0;
        }
        return columnSpaces;
    }

    /**
     * Finds row Y coordinates on one page.
     *
     * <p>Probes are stacked downwards from {@code startY}, one unit apart. The Y coordinate of
     * every non-blank probe is collected until a probe contains one of the exclude texts, at
     * which point scanning stops.
     *
     * @param page              page to probe
     * @param startY            Y coordinate of the first probe
     * @param rowNum            expected number of rows (only used as a capacity hint)
     * @param startX            X coordinate of every probe
     * @param removalHeight     amount subtracted from the art box height to get the number of probes
     * @param excludeCharacters texts that end the scan; must not be {@code null}
     * @return Y coordinates of the non-blank probes found before the scan stopped
     * @throws IOException if text extraction fails
     * @author natsu
     * @date 2022/3/11 17:48
     */
    public List<Integer> reckonRowSpace(PDPage page, int startY, int rowNum, int startX, int removalHeight,
            String[] excludeCharacters) throws IOException {
        List<Integer> rowSpaces = new ArrayList<>(Math.max(rowNum, 0));
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        // Loop-invariant, so compute the scan limit once.
        float scanHeight = page.getArtBox().getHeight() - removalHeight;
        for (int offset = 0; offset < scanHeight; offset++) {
            int y = startY + offset;
            stripper.addRegion(String.valueOf(y), new Rectangle(startX, y, ROW_PROBE_WIDTH, ROW_PROBE_HEIGHT));
        }
        stripper.setSortByPosition(true);
        stripper.extractRegions(page);
        for (String regionName : stripper.getRegions()) {
            String text = stripper.getTextForRegion(regionName);
            if (StringUtils.isBlank(text)) {
                continue;
            }
            if (containsAnyOf(text, excludeCharacters)) {
                break;
            }
            // Probe regions are named after their Y coordinate.
            rowSpaces.add(Integer.valueOf(regionName));
        }
        return rowSpaces;
    }

    /**
     * Extracts the text of every region registered on {@code stripper} and appends it to
     * {@code results}, grouping each run of {@code column} consecutive regions (in the order
     * returned by {@code getRegions()}) into one row.
     *
     * <p>A trailing group with fewer than {@code column} regions is still emitted as a row.
     * Rows are built by joining cells with {@link #SEPARATOR} and splitting again, so trailing
     * empty cells are not present in the resulting array. Line feeds are removed from the text:
     * both {@link #PDF_LINE_FEED} and the stripper's own line separator, which may differ
     * between platforms.
     *
     * <p>This method does not print the extracted rows.
     *
     * @param stripper stripper with the cell regions already registered
     * @param page     page to extract from
     * @param column   number of cells per row; must be positive when regions are registered
     * @param results  list that receives the extracted rows
     * @throws IOException              if text extraction fails
     * @throws IllegalArgumentException if regions are registered and {@code column} is not positive
     */
    public void formatData(PDFTextStripperByArea stripper, PDPage page, int column, List<String[]> results)
            throws IOException {
        stripper.setSortByPosition(true);
        stripper.extractRegions(page);
        List<String> regions = stripper.getRegions();
        if (regions.isEmpty()) {
            return;
        }
        if (column <= 0) {
            throw new IllegalArgumentException("column must be positive when regions are present: " + column);
        }
        String lineSeparator = stripper.getLineSeparator();
        StringBuilder row = new StringBuilder();
        int lastIndex = regions.size() - 1;
        for (int i = 0; i <= lastIndex; i++) {
            row.append(stripper.getTextForRegion(regions.get(i))).append(SEPARATOR);
            // Flush when the row is complete, and also at the very end so a partial or
            // single-cell trailing row is never lost.
            if ((i + 1) % column == 0 || i == lastIndex) {
                results.add(toCells(row.toString(), lineSeparator));
                row.setLength(0);
            }
        }
    }

    /** Registers the square probes of one horizontal strip on a new stripper and extracts their text. */
    private PDFTextStripperByArea probeColumnStrip(PDPage page, int startY, int startX, int pageWidth)
            throws IOException {
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        for (int offset = 0; offset < pageWidth; offset += COLUMN_PROBE_SIZE) {
            int x = offset + startX;
            stripper.addRegion(String.valueOf(x), new Rectangle(x, startY, COLUMN_PROBE_SIZE, COLUMN_PROBE_SIZE));
        }
        stripper.setSortByPosition(true);
        stripper.extractRegions(page);
        return stripper;
    }

    /** Returns whether {@code text} contains at least one of {@code candidates}. */
    private static boolean containsAnyOf(String text, String[] candidates) {
        for (String candidate : candidates) {
            if (StringUtils.contains(text, candidate)) {
                return true;
            }
        }
        return false;
    }

    /** Removes line feeds from a separator-joined row and splits it into cells. */
    private static String[] toCells(String rawRow, String lineSeparator) {
        String cleaned = rawRow.replace(PDF_LINE_FEED, "");
        if (lineSeparator != null && !lineSeparator.isEmpty() && !PDF_LINE_FEED.equals(lineSeparator)) {
            cleaned = cleaned.replace(lineSeparator, "");
        }
        return cleaned.split(SEPARATOR);
    }
}
广发:
package 坐标;
import com.yunsax.common.fileutils.FileUtils;
import com.yunsax.common.json.utils.GsonUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.awt.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Parser for Guangfa bank (广发银行) statement PDFs.
 *
 * @author natsu
 * @date 2022/3/7 9:27
 */
public class GuangFaBankBillAnalysis extends AbstractBankBillAnalysis {

    /** Expected number of rows per page; used as a capacity hint only. */
    private static final int ROWS_PER_PAGE = 23;

    /** Horizontal distance scanned for columns; also the right edge used for the last column. */
    private static final int PAGE_WIDTH = 840;

    /** Y coordinate at which column starts are probed. */
    private static final int COLUMN_PROBE_Y = 130;

    /** Y coordinate from which row starts are probed. */
    private static final int ROW_PROBE_Y = 140;

    /** Height given to the last row of a page, which has no following row to measure against. */
    private static final int LAST_ROW_HEIGHT = 28;

    /**
     * Per-column divisors for the left shift applied to detected column starts: column k moves
     * left by (distance to the already shifted column k-1) / divisor, with column 0 measured
     * from X = 0. Columns beyond this table are left as detected.
     */
    private static final int[] COLUMN_SHIFT_DIVISORS = {3, 4, 4, 4, 3, 6, 5};

    /**
     * Extracts the statement rows of every page.
     *
     * <p>Pages on which no column start is detected are skipped instead of failing, and the
     * column adjustment is limited to the columns actually detected.
     *
     * @param pdDocument the loaded PDF document
     * @return one {@code String[]} per extracted row, in page order
     * @throws Exception if text extraction fails
     */
    @Override
    protected List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception {
        PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
        PDPageTree tree = catalog.getPages();
        int countPage = tree.getCount();
        // Initial capacity hint; replaced by the detected column count after each page.
        int column = 9;
        List<String[]> results = new ArrayList<>(countPage * ROWS_PER_PAGE);
        for (int pageNo = 0; pageNo < countPage; pageNo++) {
            PDPage page = tree.get(pageNo);
            List<Integer> columnSpaces = reckonColumnSpace(page, COLUMN_PROBE_Y, column, 0, PAGE_WIDTH, 2);
            column = columnSpaces.size();
            if (!columnSpaces.isEmpty()) {
                shiftColumnStarts(columnSpaces);
                // Rows are probed at the (shifted) start of the first column.
                List<Integer> rowSpaces = reckonRowSpace(page, ROW_PROBE_Y, ROWS_PER_PAGE, columnSpaces.get(0), 200,
                        new String[] { "期" });
                PDFTextStripperByArea stripper = buildCellRegions(columnSpaces, rowSpaces);
                formatData(stripper, page, column, results);
            }
            System.out.println("总数:" + results.size());
        }
        return results;
    }

    /** Moves each detected column start to the left according to {@link #COLUMN_SHIFT_DIVISORS}. */
    private static void shiftColumnStarts(List<Integer> columnSpaces) {
        int adjustable = Math.min(columnSpaces.size(), COLUMN_SHIFT_DIVISORS.length);
        int previous = 0;
        for (int k = 0; k < adjustable; k++) {
            int current = columnSpaces.get(k);
            int shifted = current - (current - previous) / COLUMN_SHIFT_DIVISORS[k];
            columnSpaces.set(k, shifted);
            // The next column is measured against this already shifted value.
            previous = shifted;
        }
    }

    /** Registers one region per table cell, row by row, on a new stripper. */
    private static PDFTextStripperByArea buildCellRegions(List<Integer> columnSpaces, List<Integer> rowSpaces)
            throws java.io.IOException {
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        int no = 1;
        int lastRow = rowSpaces.size() - 1;
        int lastColumn = columnSpaces.size() - 1;
        for (int j = 0; j <= lastRow; j++) {
            int rowHeight = (j == lastRow) ? LAST_ROW_HEIGHT : rowSpaces.get(j + 1) - rowSpaces.get(j);
            // Start a third of a row above the detected Y coordinate.
            int top = rowSpaces.get(j) - (rowHeight / 3);
            for (int i = 0; i <= lastColumn; i++) {
                int left = columnSpaces.get(i);
                int columnWidth = (i == lastColumn) ? PAGE_WIDTH - left : columnSpaces.get(i + 1) - left;
                stripper.addRegion("regionName" + no, new Rectangle(left, top, columnWidth, rowHeight));
                no += 1;
            }
        }
        return stripper;
    }

    /**
     * Parses a hard-coded sample file and writes the rows to a sibling {@code .txt} file.
     *
     * @param args unused
     * @throws Exception if parsing or writing fails
     */
    public static void main(String[] args) throws Exception {
        AbstractBankBillAnalysis abstractBankBillAnalysis = new GuangFaBankBillAnalysis();
        long beginTime = System.currentTimeMillis();
        String filePathName = "D:\\银行流水\\2.广发银行.pdf";
        String filePassWord = "密码";
        List<String[]> results = abstractBankBillAnalysis.analysisBankBill(filePathName, filePassWord);
        System.out.println("耗时:" + (System.currentTimeMillis() - beginTime));
        FileUtils.deleteFile(filePathName.concat(".txt"));
        FileUtils.writeFile(filePathName.concat(".txt"), results, true, SEPARATOR);
    }
}
交通:
package 坐标;
import com.yunsax.common.fileutils.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.awt.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Parser for Bank of Communications (交通银行) statement PDFs.
 *
 * @author natsu
 * @date 2022/3/7 9:27
 */
public class JiaoTongBankBillAnalysis extends AbstractBankBillAnalysis {

    /**
     * Extracts the statement rows of every page.
     *
     * <p>For each page the column and row start coordinates are probed, one region per table
     * cell is registered, and the cell texts are appended to the result row by row.
     *
     * @param pdDocument the loaded PDF document
     * @return one {@code String[]} per extracted row, in page order
     * @throws Exception if text extraction fails
     */
    @Override
    protected List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception {
        PDDocumentCatalog documentCatalog = pdDocument.getDocumentCatalog();
        PDPageTree pageTree = documentCatalog.getPages();
        int totalPages = pageTree.getCount();
        // Expected rows per page; used as a capacity hint only.
        int rowsPerPage = 33;
        int columnProbeY = 210;
        int rowProbeY = 220;
        // Starts at zero and is replaced by the detected column count after each page.
        int columnCount = 0;
        List<String[]> results = new ArrayList<>(totalPages * rowsPerPage);

        for (int pageIndex = 0; pageIndex < totalPages; pageIndex++) {
            PDPage currentPage = pageTree.get(pageIndex);

            List<Integer> xStarts = reckonColumnSpace(currentPage, columnProbeY, columnCount, 40, 600, 4);
            columnCount = xStarts.size();
            List<Integer> yStarts = reckonRowSpace(currentPage, rowProbeY, rowsPerPage, 40, 0, new String[]{"方发"});

            PDFTextStripperByArea areaStripper = new PDFTextStripperByArea();
            // Region names are numbered from 1 on every page.
            int regionSeq = 1;
            int lastRow = yStarts.size() - 1;
            int lastCol = xStarts.size() - 1;
            for (int row = 0; row <= lastRow; row++) {
                int height;
                if (row == lastRow) {
                    // The last row has no following row to measure against.
                    height = 15;
                } else {
                    height = yStarts.get(row + 1) - yStarts.get(row);
                }
                // Start a third of a row above the detected Y coordinate.
                int top = yStarts.get(row) - (height / 3);
                for (int col = 0; col <= lastCol; col++) {
                    int left = xStarts.get(col);
                    int width;
                    if (col == lastCol) {
                        // The last column extends to the bleed box width.
                        width = (int) currentPage.getBleedBox().getWidth() - left;
                    } else {
                        width = xStarts.get(col + 1) - left;
                    }
                    areaStripper.addRegion("regionName" + regionSeq, new Rectangle(left, top, width, height));
                    regionSeq++;
                }
            }

            formatData(areaStripper, currentPage, columnCount, results);
            System.out.println("总数:" + results.size());
        }
        return results;
    }

    /**
     * Parses a hard-coded sample file and writes the rows to a sibling {@code .txt} file.
     *
     * @param args unused
     * @throws Exception if parsing or writing fails
     */
    public static void main(String[] args) throws Exception {
        AbstractBankBillAnalysis analyzer = new JiaoTongBankBillAnalysis();
        long startedAt = System.currentTimeMillis();
        String pdfPath = "D:\\银行流水\\交通银行.pdf";
        String pdfPassword = "";
        List<String[]> rows = analyzer.analysisBankBill(pdfPath, pdfPassword);
        System.out.println("耗时:" + (System.currentTimeMillis() - startedAt));
        FileUtils.deleteFile(pdfPath.concat(".txt"));
        FileUtils.writeFile(pdfPath.concat(".txt"), rows, true, SEPARATOR);
    }
}
农业:
package 坐标;
import com.yunsax.common.fileutils.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.awt.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Parser for Agricultural Bank (农业银行) statement PDFs.
 *
 * @author natsu
 * @date 2022/3/7 9:27
 */
public class NongYeBankBillAnalysis extends AbstractBankBillAnalysis {

    /** Expected number of rows per page; used as a capacity hint only. */
    private static final int ROWS_PER_PAGE = 19;

    /** Horizontal distance scanned for columns; also the right edge used for the last column. */
    private static final int PAGE_WIDTH = 840;

    /** Height given to the last row of a page, which has no following row to measure against. */
    private static final int LAST_ROW_HEIGHT = 20;

    /** Column and row probe Y coordinates used on the first page. */
    private static final int FIRST_PAGE_COLUMN_PROBE_Y = 120;
    private static final int FIRST_PAGE_ROW_PROBE_Y = 125;

    /** Column and row probe Y coordinates used on every page after the first. */
    private static final int LATER_PAGE_COLUMN_PROBE_Y = 35;
    private static final int LATER_PAGE_ROW_PROBE_Y = 40;

    /**
     * Extracts the statement rows of every page.
     *
     * <p>The first page is probed at different Y coordinates than the following pages. Pages on
     * which no column start is detected are skipped instead of failing.
     *
     * @param pdDocument the loaded PDF document
     * @return one {@code String[]} per extracted row, in page order
     * @throws Exception if text extraction fails
     */
    @Override
    protected List<String[]> analysisBankBillContent(PDDocument pdDocument) throws Exception {
        PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
        PDPageTree tree = catalog.getPages();
        int countPage = tree.getCount();
        // Initial capacity hint; replaced by the detected column count after each page.
        int column = 9;
        List<String[]> results = new ArrayList<>(countPage * ROWS_PER_PAGE);
        for (int pageNo = 0; pageNo < countPage; pageNo++) {
            boolean firstPage = pageNo == 0;
            int startYColumn = firstPage ? FIRST_PAGE_COLUMN_PROBE_Y : LATER_PAGE_COLUMN_PROBE_Y;
            int startYRow = firstPage ? FIRST_PAGE_ROW_PROBE_Y : LATER_PAGE_ROW_PROBE_Y;
            PDPage page = tree.get(pageNo);
            List<Integer> columnSpaces = reckonColumnSpace(page, startYColumn, column, 0, PAGE_WIDTH, 1);
            column = columnSpaces.size();
            if (!columnSpaces.isEmpty()) {
                // Rows are probed at the start of the first column.
                List<Integer> rowSpaces = reckonRowSpace(page, startYRow, ROWS_PER_PAGE, columnSpaces.get(0), 0,
                        new String[] { "该" });
                PDFTextStripperByArea stripper = buildCellRegions(columnSpaces, rowSpaces);
                formatData(stripper, page, column, results);
            }
            System.out.println("总数:" + results.size());
        }
        return results;
    }

    /** Registers one region per table cell, row by row, on a new stripper. */
    private static PDFTextStripperByArea buildCellRegions(List<Integer> columnSpaces, List<Integer> rowSpaces)
            throws java.io.IOException {
        PDFTextStripperByArea stripper = new PDFTextStripperByArea();
        int no = 1;
        int lastRow = rowSpaces.size() - 1;
        int lastColumn = columnSpaces.size() - 1;
        for (int j = 0; j <= lastRow; j++) {
            int rowHeight = (j == lastRow) ? LAST_ROW_HEIGHT : rowSpaces.get(j + 1) - rowSpaces.get(j);
            // Start a quarter of a row above the detected Y and extend the height by a fifth.
            int top = rowSpaces.get(j) - (rowHeight / 4);
            int cellHeight = rowHeight + (rowHeight / 5);
            for (int i = 0; i <= lastColumn; i++) {
                int left = columnSpaces.get(i);
                int columnWidth = (i == lastColumn) ? PAGE_WIDTH - left : columnSpaces.get(i + 1) - left;
                stripper.addRegion("regionName" + no, new Rectangle(left, top, columnWidth, cellHeight));
                no += 1;
            }
        }
        return stripper;
    }

    /**
     * Parses a hard-coded sample file and writes the rows to a sibling {@code .txt} file.
     *
     * @param args unused
     * @throws Exception if parsing or writing fails
     */
    public static void main(String[] args) throws Exception {
        AbstractBankBillAnalysis abstractBankBillAnalysis = new NongYeBankBillAnalysis();
        long beginTime = System.currentTimeMillis();
        String filePathName = "D:\\银行流水\\农业银行.pdf";
        String filePassWord = "";
        List<String[]> results = abstractBankBillAnalysis.analysisBankBill(filePathName, filePassWord);
        System.out.println("耗时:" + (System.currentTimeMillis() - beginTime));
        FileUtils.deleteFile(filePathName.concat(".txt"));
        FileUtils.writeFile(filePathName.concat(".txt"), results, true, SEPARATOR);
    }
}
目前主要贴这几个解析代码,其实思路都区别不大。